1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
40static EVT getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
50
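// Returns the maximum number of bits needed to represent the value of Op when
// treated as an unsigned integer.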
51unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 return DAG.computeKnownBits(Op).countMaxActiveBits();
53}
54
55unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
61AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather than generating calls to memset, memcpy, or memmove.
66 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
67 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
68 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
71 MaxGluedStoresPerMemcpy = 16;
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
75 setOperationAction(ISD::LOAD, MVT::f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
78 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
81 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
84 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
87 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
90 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
93 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
96 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
99 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
117 setOperationAction(ISD::LOAD, MVT::i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
120 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
123 setOperationAction(ISD::LOAD, MVT::f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
126 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
129 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
132 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
135 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
138 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
141 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
144 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
153 setOperationAction(ISD::LOAD, MVT::i128, Promote);
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
157 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
160 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
163 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
166 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
169 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
172 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
175 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
178 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
184 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, MVT::i64, VT,
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
202 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, MVT::i32, MemVT,
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
240 setOperationAction(ISD::STORE, MVT::f32, Promote);
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
243 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
246 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
249 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
252 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
255 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
258 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
261 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
264 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
267 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
270 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
273 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
276 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
279 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
282 setOperationAction(ISD::STORE, MVT::i64, Promote);
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
285 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
288 setOperationAction(ISD::STORE, MVT::f64, Promote);
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
291 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
294 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
297 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
300 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
303 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
306 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
309 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
312 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
315 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
318 setOperationAction(ISD::STORE, MVT::i128, Promote);
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
371 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
372 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
373
374 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
375 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
376 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
377
378 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
379 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
380 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
381
382 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
383 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
384 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
385
386 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
387 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
388 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
389 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
390 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
391 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
393
394 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
395 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
396
397 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
398
399 // For R600, this is totally unsupported, just custom lower to produce an
400 // error.
401 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
402
403 // Library functions. These default to Expand, but we have instructions
404 // for them.
407 {MVT::f16, MVT::f32}, Legal);
409
411 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
413 {MVT::f16, MVT::f32, MVT::f64}, Expand);
414
417 Custom);
418
419 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
420
421 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
422
423 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
424 Expand);
425
426 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
427
428 if (Subtarget->has16BitInsts()) {
429 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
431 } else {
432 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
434 }
435
437 Custom);
438
439 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
440 if (Subtarget->has16BitInsts()) {
442 }
443
444 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
445 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
446 // default unless marked custom/legal.
447 setOperationAction(ISD::IS_FPCLASS,
448 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
449 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
450 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
451 MVT::v16f64},
452 Custom);
453
454 if (isTypeLegal(MVT::f16))
455 setOperationAction(ISD::IS_FPCLASS,
456 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
457 Custom);
458
459 // Expand to fneg + fadd.
460 setOperationAction(ISD::FSUB, MVT::f64, Expand);
461
462 setOperationAction(ISD::CONCAT_VECTORS,
463 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
464 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
465 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
466 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
467 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
468 Custom);
469
471 setOperationAction(ISD::EXTRACT_SUBVECTOR,
472 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
473 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
474 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
475 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
476 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
477 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
478 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
479 Custom);
480
482 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
483
484 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
485 for (MVT VT : ScalarIntVTs) {
486 // These should use [SU]DIVREM, so set them to expand
487 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
488 Expand);
489
490 // GPU does not have divrem function for signed or unsigned.
491 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
492
493 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
495
497
498 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
499 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
500 }
501
502 // The hardware supports 32-bit FSHR, but not FSHL.
503 setOperationAction(ISD::FSHR, MVT::i32, Legal);
504
505 // The hardware supports 32-bit ROTR, but not ROTL.
506 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
507 setOperationAction(ISD::ROTR, MVT::i64, Expand);
508
510
514 MVT::i64, Custom);
516
518 Legal);
519
522 MVT::i64, Custom);
523
524 for (auto VT : {MVT::i8, MVT::i16})
526
527 static const MVT::SimpleValueType VectorIntTypes[] = {
528 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
529 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
530
531 for (MVT VT : VectorIntTypes) {
532 // Expand the following operations for the current type by default.
545 VT, Expand);
546 }
547
548 static const MVT::SimpleValueType FloatVectorTypes[] = {
549 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
550 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
551
552 for (MVT VT : FloatVectorTypes) {
565 VT, Expand);
566 }
567
568 // This causes using an unrolled select operation rather than expansion with
569 // bit operations. This is in general better, but the alternative using BFI
570 // instructions may be better if the select sources are SGPRs.
571 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
572 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
573
574 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
575 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
576
577 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
578 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
579
580 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
581 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
582
583 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
584 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
585
586 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
587 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
588
589 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
590 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
591
592 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
593 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
594
595 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
596 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
597
598 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
599 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
600
602 setJumpIsExpensive(true);
603
606
608
609 // We want to find all load dependencies for long chains of stores to enable
610 // merging into very wide vectors. The problem is with vectors with > 4
611 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
612 // vectors are a legal type, even though we have to split the loads
613 // usually. When we can more precisely specify load legality per address
614 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
615 // smarter so that they can figure out what to do in 2 iterations without all
616 // N > 4 stores on the same chain.
618
619 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
620 // about these during lowering.
621 MaxStoresPerMemcpy = 0xffffffff;
622 MaxStoresPerMemmove = 0xffffffff;
623 MaxStoresPerMemset = 0xffffffff;
624
625 // The expansion for 64-bit division is enormous.
626 if (AMDGPUBypassSlowDiv)
627 addBypassSlowDiv(64, 32);
628
639
643}
644
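// Returns true if the sign of a zero result of Op can be ignored, either
// because of the global no-signed-zeros FP option or the node's nsz flag.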
645bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
646 if (getTargetMachine().Options.NoSignedZerosFPMath)
647 return true;
648
649 const auto Flags = Op.getNode()->getFlags();
650 if (Flags.hasNoSignedZeros())
651 return true;
652
653 return false;
654}
655
656//===----------------------------------------------------------------------===//
657// Target Information
658//===----------------------------------------------------------------------===//
659
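// Returns true if an fneg of a value produced by this opcode can typically be
// folded into the operation itself via source modifiers.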
661static bool fnegFoldsIntoOpcode(unsigned Opc) {
662 switch (Opc) {
663 case ISD::FADD:
664 case ISD::FSUB:
665 case ISD::FMUL:
666 case ISD::FMA:
667 case ISD::FMAD:
668 case ISD::FMINNUM:
669 case ISD::FMAXNUM:
670 case ISD::FMINNUM_IEEE:
671 case ISD::FMAXNUM_IEEE:
672 case ISD::FMINIMUM:
673 case ISD::FMAXIMUM:
674 case ISD::FMINIMUMNUM:
675 case ISD::FMAXIMUMNUM:
676 case ISD::SELECT:
677 case ISD::FSIN:
678 case ISD::FTRUNC:
679 case ISD::FRINT:
680 case ISD::FNEARBYINT:
681 case ISD::FROUNDEVEN:
682 case ISD::FCANONICALIZE:
683 case AMDGPUISD::RCP:
684 case AMDGPUISD::RCP_LEGACY:
685 case AMDGPUISD::RCP_IFLAG:
686 case AMDGPUISD::SIN_HW:
687 case AMDGPUISD::FMUL_LEGACY:
688 case AMDGPUISD::FMIN_LEGACY:
689 case AMDGPUISD::FMAX_LEGACY:
690 case AMDGPUISD::FMED3:
691 // TODO: handle llvm.amdgcn.fma.legacy
692 return true;
693 case ISD::BITCAST:
694 llvm_unreachable("bitcast is special cased");
695 default:
696 return false;
697 }
698}
699
700static bool fnegFoldsIntoOp(const SDNode *N) {
701 unsigned Opc = N->getOpcode();
702 if (Opc == ISD::BITCAST) {
703 // TODO: Is there a benefit to checking the conditions performFNegCombine
704 // does? We don't for the other cases.
705 SDValue BCSrc = N->getOperand(0);
706 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
707 return BCSrc.getNumOperands() == 2 &&
708 BCSrc.getOperand(1).getValueSizeInBits() == 32;
709 }
710
711 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
712 }
713
714 return fnegFoldsIntoOpcode(Opc);
715}
716
717/// \returns true if the operation will definitely need to use a 64-bit
718/// encoding, and thus will use a VOP3 encoding regardless of the source
719/// modifiers.
721static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
722 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
723 VT == MVT::f64;
724}
725
726/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
727/// type for ISD::SELECT.
729static bool selectSupportsSourceMods(const SDNode *N) {
730 // TODO: Only applies if select will be vector
731 return N->getValueType(0) == MVT::f32;
732}
733
734// Most FP instructions support source modifiers, but this could be refined
735// slightly.
737static bool hasSourceMods(const SDNode *N) {
738 if (isa<MemSDNode>(N))
739 return false;
740
741 switch (N->getOpcode()) {
742 case ISD::CopyToReg:
743 case ISD::FDIV:
744 case ISD::FREM:
745 case ISD::INLINEASM:
749
750 // TODO: Should really be looking at the users of the bitcast. These are
751 // problematic because bitcasts are used to legalize all stores to integer
752 // types.
753 case ISD::BITCAST:
754 return false;
755 case ISD::INTRINSIC_WO_CHAIN: {
756 switch (N->getConstantOperandVal(0)) {
757 case Intrinsic::amdgcn_interp_p1:
758 case Intrinsic::amdgcn_interp_p2:
759 case Intrinsic::amdgcn_interp_mov:
760 case Intrinsic::amdgcn_interp_p1_f16:
761 case Intrinsic::amdgcn_interp_p2_f16:
762 return false;
763 default:
764 return true;
765 }
766 }
767 case ISD::SELECT:
768 return selectSupportsSourceMods(N);
769 default:
770 return true;
771 }
772}
773
774static bool allUsesHaveSourceMods(const SDNode *N,
775 unsigned CostThreshold) {
776 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
777 // it is truly free to use a source modifier in all cases. If there are
778 // multiple users and each one will necessitate using VOP3, there will be
779 // a code size increase. Try to avoid increasing code size unless we know it
780 // will save on the instruction count.
781 unsigned NumMayIncreaseSize = 0;
782 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
783
784 assert(!N->use_empty());
785
786 // XXX - Should this limit number of uses to check?
787 for (const SDNode *U : N->users()) {
788 if (!hasSourceMods(U))
789 return false;
790
791 if (!opMustUseVOP3Encoding(U, VT)) {
792 if (++NumMayIncreaseSize > CostThreshold)
793 return false;
794 }
795 }
796
797 return true;
798}
799
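// Extended return values are widened to the next multiple of 32 bits (plain
// i32 for anything smaller).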
800EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
801 ISD::NodeType ExtendKind) const {
802 assert(!VT.isVector() && "only scalar expected");
803
804 // Round to the next multiple of 32-bits.
805 unsigned Size = VT.getSizeInBits();
806 if (Size <= 32)
807 return MVT::i32;
808 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
809}
810
812 return 32;
813}
814
816 return true;
817}
818
819// The backend supports 32 and 64 bit floating point immediates.
820// FIXME: Why are we reporting vectors of FP immediates as legal?
821bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
822 bool ForCodeSize) const {
823 EVT ScalarVT = VT.getScalarType();
824 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
825 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
826}
827
828// We don't want to shrink f64 / f32 constants.
829bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
834bool AMDGPUTargetLowering::shouldReduceLoadWidth(
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
851 MemSDNode *MN = cast<MemSDNode>(N);
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
857 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
858 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
859 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
874bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
890 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
898bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
899 return true;
900}
901
902bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
903 return true;
904}
905
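// Returns true for nodes whose result is known to be uniform across the wave
// (chains, always-uniform intrinsics, constant-address loads, ballot-style SETCC).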
906bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
911 case ISD::INTRINSIC_WO_CHAIN: {
912 unsigned IntrID = N->getConstantOperandVal(0);
913 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
914 }
915 case ISD::INTRINSIC_W_CHAIN: {
916 unsigned IntrID = N->getConstantOperandVal(1);
917 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
921 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
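// Target hook for fneg folding: FMA/FMAD are only negated when every user
// accepts source modifiers, and a negation of RCP is pushed into its source.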
930SDValue AMDGPUTargetLowering::getNegatedExpression(
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
965bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
966 assert(VT.isFloatingPoint());
967
968 // Packed operations do not have a fabs modifier.
969 return VT == MVT::f32 || VT == MVT::f64 ||
970 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
971}
972
973bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
974 assert(VT.isFloatingPoint());
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
980bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
986bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
998bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1005}
1006
1007bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize == 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1019bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1029bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1032 // this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 || Dest == MVT::i64;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1041bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ADD:
1045 case ISD::SUB:
1046 case ISD::SHL:
1047 case ISD::SRL:
1048 case ISD::SRA:
1049 case ISD::AND:
1050 case ISD::OR:
1051 case ISD::XOR:
1052 case ISD::MUL:
1053 case ISD::SETCC:
1054 case ISD::SELECT:
1055 case ISD::SMIN:
1056 case ISD::SMAX:
1057 case ISD::UMIN:
1058 case ISD::UMAX:
1059 if (Subtarget->has16BitInsts() &&
1060 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1061 // Don't narrow back down to i16 if promoted to i32 already.
1062 if (!N->isDivergent() && DestVT.isInteger() &&
1063 DestVT.getScalarSizeInBits() > 1 &&
1064 DestVT.getScalarSizeInBits() <= 16 &&
1065 SrcVT.getScalarSizeInBits() > 16) {
1066 return false;
1067 }
1068 }
1069 return true;
1070 default:
1071 break;
1072 }
1073
1074 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1075 // limited number of native 64-bit operations. Shrinking an operation to fit
1076 // in a single 32-bit register should always be helpful. As currently used,
1077 // this is much less general than the name suggests, and is only used in
1078 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1079 // not profitable, and may actually be harmful.
1080 if (isa<LoadSDNode>(N))
1081 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1082
1083 return true;
1084}
1085
1086bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
1087 const SDNode* N, CombineLevel Level) const {
1088 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1089 N->getOpcode() == ISD::SRL) &&
1090 "Expected shift op");
1091
1092 SDValue ShiftLHS = N->getOperand(0);
1093 if (!ShiftLHS->hasOneUse())
1094 return false;
1095
1096 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1097 !ShiftLHS.getOperand(0)->hasOneUse())
1098 return false;
1099
1100 // Always commute pre-type legalization and right shifts.
1101 // We're looking for shl(or(x,y),z) patterns.
1102 if (Level < CombineLevel::AfterLegalizeTypes ||
1103 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1104 return true;
1105
1106 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1107 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1108 (N->user_begin()->getOpcode() == ISD::SRA ||
1109 N->user_begin()->getOpcode() == ISD::SRL))
1110 return false;
1111
1112 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1113 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1114 if (LHS.getOpcode() != ISD::SHL)
1115 return false;
1116 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1117 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1118 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1119 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1120 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1121 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1122 };
1123 SDValue LHS = N->getOperand(0).getOperand(0);
1124 SDValue RHS = N->getOperand(0).getOperand(1);
1125 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1126}
1127
1128//===---------------------------------------------------------------------===//
1129// TargetLowering Callbacks
1130//===---------------------------------------------------------------------===//
1131
1132CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
1133 bool IsVarArg) {
1134 switch (CC) {
1135 case CallingConv::AMDGPU_VS:
1136 case CallingConv::AMDGPU_GS:
1137 case CallingConv::AMDGPU_PS:
1138 case CallingConv::AMDGPU_CS:
1139 case CallingConv::AMDGPU_HS:
1140 case CallingConv::AMDGPU_ES:
1141 case CallingConv::AMDGPU_LS:
1142 return CC_AMDGPU;
1143 case CallingConv::AMDGPU_CS_Chain:
1144 case CallingConv::AMDGPU_CS_ChainPreserve:
1145 return CC_AMDGPU_CS_CHAIN;
1146 case CallingConv::C:
1147 case CallingConv::Fast:
1148 case CallingConv::Cold:
1149 return CC_AMDGPU_Func;
1150 case CallingConv::AMDGPU_Gfx:
1151 case CallingConv::AMDGPU_Gfx_WholeWave:
1152 return CC_SI_Gfx;
1155 default:
1156 reportFatalUsageError("unsupported calling convention for call");
1157 }
1158}
1159
1160CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1161 bool IsVarArg) {
1162 switch (CC) {
1163 case CallingConv::AMDGPU_KERNEL:
1164 case CallingConv::SPIR_KERNEL:
1165 llvm_unreachable("kernels should not be handled here");
1166 case CallingConv::AMDGPU_VS:
1167 case CallingConv::AMDGPU_GS:
1168 case CallingConv::AMDGPU_PS:
1169 case CallingConv::AMDGPU_CS:
1170 case CallingConv::AMDGPU_HS:
1171 case CallingConv::AMDGPU_ES:
1172 case CallingConv::AMDGPU_LS:
1173 case CallingConv::AMDGPU_CS_Chain:
1174 case CallingConv::AMDGPU_CS_ChainPreserve:
1175 return RetCC_SI_Shader;
1176 case CallingConv::AMDGPU_Gfx:
1177 case CallingConv::AMDGPU_Gfx_WholeWave:
1178 return RetCC_SI_Gfx;
1179 case CallingConv::C:
1180 case CallingConv::Fast:
1181 case CallingConv::Cold:
1182 return RetCC_AMDGPU_Func;
1183 default:
1184 reportFatalUsageError("unsupported calling convention");
1185 }
1186}
1187
1188/// The SelectionDAGBuilder will automatically promote function arguments
1189/// with illegal types. However, this does not work for the AMDGPU targets
1190/// since the function arguments are stored in memory as these illegal types.
1191/// In order to handle this properly we need to get the original types sizes
1192/// from the LLVM IR Function and fixup the ISD:InputArg values before
1193/// passing them to AnalyzeFormalArguments()
1194
1195/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1196/// input values across multiple registers. Each item in the Ins array
1197/// represents a single value that will be stored in registers. Ins[x].VT is
1198/// the value type of the value that will be stored in the register, so
1199/// whatever SDNode we lower the argument to needs to be this type.
1200///
1201/// In order to correctly lower the arguments we need to know the size of each
1202/// argument. Since Ins[x].VT gives us the size of the register that will
1203/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1204/// for the original function argument so that we can deduce the correct memory
1205/// type to use for Ins[x]. In most cases the correct memory type will be
1206/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1207/// we have a kernel argument of type v8i8, this argument will be split into
1208/// 8 parts and each part will be represented by its own item in the Ins array.
1209/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1210/// the argument before it was split. From this, we deduce that the memory type
1211/// for each individual part is i8. We pass the memory type as LocVT to the
1212/// calling convention analysis function and the register type (Ins[x].VT) as
1213/// the ValVT.
1214void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1215 CCState &State,
1216 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1217 const MachineFunction &MF = State.getMachineFunction();
1218 const Function &Fn = MF.getFunction();
1219 LLVMContext &Ctx = Fn.getParent()->getContext();
1220 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1221 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1222 CallingConv::ID CC = Fn.getCallingConv();
1223
1224 Align MaxAlign = Align(1);
1225 uint64_t ExplicitArgOffset = 0;
1226 const DataLayout &DL = Fn.getDataLayout();
1227
1228 unsigned InIndex = 0;
1229
1230 for (const Argument &Arg : Fn.args()) {
1231 const bool IsByRef = Arg.hasByRefAttr();
1232 Type *BaseArgTy = Arg.getType();
1233 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1234 Align Alignment = DL.getValueOrABITypeAlignment(
1235 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1236 MaxAlign = std::max(Alignment, MaxAlign);
1237 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1238
1239 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1240 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1241
1242 // We're basically throwing away everything passed into us and starting over
1243 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1244 // to us as computed in Ins.
1245 //
1246 // We also need to figure out what type legalization is trying to do to get
1247 // the correct memory offsets.
1248
1249 SmallVector<EVT, 16> ValueVTs;
1250 SmallVector<uint64_t, 16> Offsets;
1251 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1252
1253 for (unsigned Value = 0, NumValues = ValueVTs.size();
1254 Value != NumValues; ++Value) {
1255 uint64_t BasePartOffset = Offsets[Value];
1256
1257 EVT ArgVT = ValueVTs[Value];
1258 EVT MemVT = ArgVT;
1259 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1260 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1261
1262 if (NumRegs == 1) {
1263 // This argument is not split, so the IR type is the memory type.
1264 if (ArgVT.isExtended()) {
1265 // We have an extended type, like i24, so we should just use the
1266 // register type.
1267 MemVT = RegisterVT;
1268 } else {
1269 MemVT = ArgVT;
1270 }
1271 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1272 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1273 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1274 // We have a vector value which has been split into a vector with
1275 // the same scalar type, but fewer elements. This should handle
1276 // all the floating-point vector types.
1277 MemVT = RegisterVT;
1278 } else if (ArgVT.isVector() &&
1279 ArgVT.getVectorNumElements() == NumRegs) {
1280 // This arg has been split so that each element is stored in a separate
1281 // register.
1282 MemVT = ArgVT.getScalarType();
1283 } else if (ArgVT.isExtended()) {
1284 // We have an extended type, like i65.
1285 MemVT = RegisterVT;
1286 } else {
1287 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1288 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1289 if (RegisterVT.isInteger()) {
1290 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1291 } else if (RegisterVT.isVector()) {
1292 assert(!RegisterVT.getScalarType().isFloatingPoint());
1293 unsigned NumElements = RegisterVT.getVectorNumElements();
1294 assert(MemoryBits % NumElements == 0);
1295 // This vector type has been split into another vector type with
1296 // a different elements size.
1297 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1298 MemoryBits / NumElements);
1299 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1300 } else {
1301 llvm_unreachable("cannot deduce memory type.");
1302 }
1303 }
1304
1305 // Convert one element vectors to scalar.
1306 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1307 MemVT = MemVT.getScalarType();
1308
1309 // Round up vec3/vec5 argument.
1310 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1311 MemVT = MemVT.getPow2VectorType(State.getContext());
1312 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1313 MemVT = MemVT.getRoundIntegerType(State.getContext());
1314 }
1315
1316 unsigned PartOffset = 0;
1317 for (unsigned i = 0; i != NumRegs; ++i) {
1318 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1319 BasePartOffset + PartOffset,
1320 MemVT.getSimpleVT(),
1321 CCValAssign::Full));
1322 PartOffset += MemVT.getStoreSize();
1323 }
1324 }
1325 }
1326}
1327
1328SDValue AMDGPUTargetLowering::LowerReturn(
1329 SDValue Chain, CallingConv::ID CallConv,
1330 bool isVarArg,
1331 const SmallVectorImpl<ISD::OutputArg> &Outs,
1332 const SmallVectorImpl<SDValue> &OutVals,
1333 const SDLoc &DL, SelectionDAG &DAG) const {
1334 // FIXME: Fails for r600 tests
1335 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1336 // "wave terminate should not have return values");
1337 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1338}
1339
1340//===---------------------------------------------------------------------===//
1341// Target specific lowering
1342//===---------------------------------------------------------------------===//
1343
1344/// Selects the correct CCAssignFn for a given CallingConvention value.
1345CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1346 bool IsVarArg) {
1347 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1348}
1349
1350CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1351 bool IsVarArg) {
1352 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1353}
1354
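// Builds a TokenFactor that orders outgoing stack-argument stores after any
// incoming stack-argument loads that overlap the clobbered frame index.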
1355SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1356 SelectionDAG &DAG,
1357 MachineFrameInfo &MFI,
1358 int ClobberedFI) const {
1359 SmallVector<SDValue, 8> ArgChains;
1360 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1361 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1362
1363 // Include the original chain at the beginning of the list. When this is
1364 // used by target LowerCall hooks, this helps legalize find the
1365 // CALLSEQ_BEGIN node.
1366 ArgChains.push_back(Chain);
1367
1368 // Add a chain value for each stack argument corresponding
1369 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1370 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1371 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1372 if (FI->getIndex() < 0) {
1373 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1374 int64_t InLastByte = InFirstByte;
1375 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1376
1377 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1378 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1379 ArgChains.push_back(SDValue(L, 1));
1380 }
1381 }
1382 }
1383 }
1384
1385 // Build a tokenfactor for all the chains.
1386 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1387}
1388
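// Reports an unsupported call with a diagnostic and returns poison values for
// the expected results so that lowering can continue.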
1389SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1390 SmallVectorImpl<SDValue> &InVals,
1391 StringRef Reason) const {
1392 SDValue Callee = CLI.Callee;
1393 SelectionDAG &DAG = CLI.DAG;
1394
1395 const Function &Fn = DAG.getMachineFunction().getFunction();
1396
1397 StringRef FuncName("<unknown>");
1398
1399 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1400 FuncName = G->getSymbol();
1401 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1402 FuncName = G->getGlobal()->getName();
1403
1404 DAG.getContext()->diagnose(
1405 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1406
1407 if (!CLI.IsTailCall) {
1408 for (ISD::InputArg &Arg : CLI.Ins)
1409 InVals.push_back(DAG.getPOISON(Arg.VT));
1410 }
1411
1412 return DAG.getEntryNode();
1413}
1414
1415SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1416 SmallVectorImpl<SDValue> &InVals) const {
1417 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1418}
1419
1420SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1421 SelectionDAG &DAG) const {
1422 const Function &Fn = DAG.getMachineFunction().getFunction();
1423
1425 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1426 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1427 return DAG.getMergeValues(Ops, SDLoc());
1428}
1429
1430SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1431 SelectionDAG &DAG) const {
1432 switch (Op.getOpcode()) {
1433 default:
1434 Op->print(errs(), &DAG);
1435 llvm_unreachable("Custom lowering code for this "
1436 "instruction is not implemented yet!");
1437 break;
1438 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1439 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1440 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1441 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1442 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1443 case ISD::FREM: return LowerFREM(Op, DAG);
1444 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1445 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1446 case ISD::FRINT: return LowerFRINT(Op, DAG);
1447 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1448 case ISD::FROUNDEVEN:
1449 return LowerFROUNDEVEN(Op, DAG);
1450 case ISD::FROUND: return LowerFROUND(Op, DAG);
1451 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1452 case ISD::FLOG2:
1453 return LowerFLOG2(Op, DAG);
1454 case ISD::FLOG:
1455 case ISD::FLOG10:
1456 return LowerFLOGCommon(Op, DAG);
1457 case ISD::FEXP:
1458 case ISD::FEXP10:
1459 return lowerFEXP(Op, DAG);
1460 case ISD::FEXP2:
1461 return lowerFEXP2(Op, DAG);
1462 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1463 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1464 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1465 case ISD::FP_TO_SINT:
1466 case ISD::FP_TO_UINT:
1467 return LowerFP_TO_INT(Op, DAG);
1468 case ISD::CTTZ:
1469 case ISD::CTTZ_ZERO_UNDEF:
1470 case ISD::CTLZ:
1471 case ISD::CTLZ_ZERO_UNDEF:
1472 return LowerCTLZ_CTTZ(Op, DAG);
1473 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1474 }
1475 return Op;
1476}
1477
1478void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1479 SmallVectorImpl<SDValue> &Results,
1480 SelectionDAG &DAG) const {
1481 switch (N->getOpcode()) {
1482 case ISD::SIGN_EXTEND_INREG:
1483 // Different parts of legalization seem to interpret which type of
1484 // sign_extend_inreg is the one to check for custom lowering. The extended
1485 // from type is what really matters, but some places check for custom
1486 // lowering of the result type. This results in trying to use
1487 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1488 // nothing here and let the illegal result integer be handled normally.
1489 return;
1490 case ISD::FLOG2:
1491 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1492 Results.push_back(Lowered);
1493 return;
1494 case ISD::FLOG:
1495 case ISD::FLOG10:
1496 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1497 Results.push_back(Lowered);
1498 return;
1499 case ISD::FEXP2:
1500 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1501 Results.push_back(Lowered);
1502 return;
1503 case ISD::FEXP:
1504 case ISD::FEXP10:
1505 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1506 Results.push_back(Lowered);
1507 return;
1508 case ISD::CTLZ:
1509 case ISD::CTLZ_ZERO_UNDEF:
1510 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1511 Results.push_back(Lowered);
1512 return;
1513 default:
1514 return;
1515 }
1516}
1517
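// Lowers LDS/region global addresses: returns the assigned absolute address or
// an allocated LDS offset, and emits a warning plus trap for LDS globals
// reached from non-kernel functions.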
1518SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1519 SDValue Op,
1520 SelectionDAG &DAG) const {
1521
1522 const DataLayout &DL = DAG.getDataLayout();
1523 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1524 const GlobalValue *GV = G->getGlobal();
1525
1526 if (!MFI->isModuleEntryFunction()) {
1527 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1528 if (std::optional<uint32_t> Address =
1530 if (IsNamedBarrier) {
1531 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1532 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1533 }
1534 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1535 } else if (IsNamedBarrier) {
1536 llvm_unreachable("named barrier should have an assigned address");
1537 }
1538 }
1539
1540 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1541 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1542 if (!MFI->isModuleEntryFunction() &&
1543 GV->getName() != "llvm.amdgcn.module.lds" &&
1544 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
1545 SDLoc DL(Op);
1546 const Function &Fn = DAG.getMachineFunction().getFunction();
1548 Fn, "local memory global used by non-kernel function",
1549 DL.getDebugLoc(), DS_Warning));
1550
1551 // We currently don't have a way to correctly allocate LDS objects that
1552 // aren't directly associated with a kernel. We do force inlining of
1553 // functions that use local objects. However, if these dead functions are
1554 // not eliminated, we don't want a compile time error. Just emit a warning
1555 // and a trap, since there should be no callable path here.
1556 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1557 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1558 Trap, DAG.getRoot());
1559 DAG.setRoot(OutputChain);
1560 return DAG.getPOISON(Op.getValueType());
1561 }
1562
1563 // XXX: What does the value of G->getOffset() mean?
1564 assert(G->getOffset() == 0 &&
1565 "Do not know what to do with an non-zero offset");
1566
1567 // TODO: We could emit code to handle the initialization somewhere.
1568 // We ignore the initializer for now and legalize it to allow selection.
1569 // The initializer will anyway get errored out during assembly emission.
1570 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1571 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1572 }
1573 return SDValue();
1574}
1575
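// Lowers CONCAT_VECTORS, preferring to bitcast small-element operands into i32
// pieces before rebuilding the result vector.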
1576SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1577 SelectionDAG &DAG) const {
1578 SmallVector<SDValue, 8> Args;
1579 SDLoc SL(Op);
1580
1581 EVT VT = Op.getValueType();
1582 if (VT.getVectorElementType().getSizeInBits() < 32) {
1583 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1584 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1585 unsigned NewNumElt = OpBitSize / 32;
1586 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1587 : EVT::getVectorVT(*DAG.getContext(),
1588 MVT::i32, NewNumElt);
1589 for (const SDUse &U : Op->ops()) {
1590 SDValue In = U.get();
1591 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1592 if (NewNumElt > 1)
1593 DAG.ExtractVectorElements(NewIn, Args);
1594 else
1595 Args.push_back(NewIn);
1596 }
1597
1598 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1599 NewNumElt * Op.getNumOperands());
1600 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1601 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1602 }
1603 }
1604
1605 for (const SDUse &U : Op->ops())
1606 DAG.ExtractVectorElements(U.get(), Args);
1607
1608 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1609}
1610
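// Lowers EXTRACT_SUBVECTOR; for 16-bit elements at an even start index it
// extracts whole 32-bit registers, otherwise it extracts element by element.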
1611SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1612 SelectionDAG &DAG) const {
1613 SDLoc SL(Op);
1614 SmallVector<SDValue, 8> Args;
1615 unsigned Start = Op.getConstantOperandVal(1);
1616 EVT VT = Op.getValueType();
1617 EVT SrcVT = Op.getOperand(0).getValueType();
1618
1619 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1620 unsigned NumElt = VT.getVectorNumElements();
1621 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1622 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1623
1624 // Extract 32-bit registers at a time.
1625 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1626 EVT NewVT = NumElt == 2
1627 ? MVT::i32
1628 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1629 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1630
1631 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1632 if (NumElt == 2)
1633 Tmp = Args[0];
1634 else
1635 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1636
1637 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1638 }
1639
1640 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1641 VT.getVectorNumElements());
1642
1643 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1644}
1645
1646// TODO: Handle fabs too
1647static SDValue peekFNeg(SDValue Val) {
1648 if (Val.getOpcode() == ISD::FNEG)
1649 return Val.getOperand(0);
1650
1651 return Val;
1652}
1653
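// Looks through at most one FNEG, FABS, and FCOPYSIGN (in that order) to find
// the underlying source value.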
1654static SDValue peekFPSignOps(SDValue Val) {
1655 if (Val.getOpcode() == ISD::FNEG)
1656 Val = Val.getOperand(0);
1657 if (Val.getOpcode() == ISD::FABS)
1658 Val = Val.getOperand(0);
1659 if (Val.getOpcode() == ISD::FCOPYSIGN)
1660 Val = Val.getOperand(0);
1661 return Val;
1662}
1663
1664SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1665 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1666 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1667 SelectionDAG &DAG = DCI.DAG;
1668 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1669 switch (CCOpcode) {
1670 case ISD::SETOEQ:
1671 case ISD::SETONE:
1672 case ISD::SETUNE:
1673 case ISD::SETNE:
1674 case ISD::SETUEQ:
1675 case ISD::SETEQ:
1676 case ISD::SETFALSE:
1677 case ISD::SETFALSE2:
1678 case ISD::SETTRUE:
1679 case ISD::SETTRUE2:
1680 case ISD::SETUO:
1681 case ISD::SETO:
1682 break;
1683 case ISD::SETULE:
1684 case ISD::SETULT: {
1685 if (LHS == True)
1686 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1687 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1688 }
1689 case ISD::SETOLE:
1690 case ISD::SETOLT:
1691 case ISD::SETLE:
1692 case ISD::SETLT: {
1693 // Ordered. Assume ordered for undefined.
1694
1695 // Only do this after legalization to avoid interfering with other combines
1696 // which might occur.
1697 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1698 !DCI.isCalledByLegalizer())
1699 return SDValue();
1700
1701 // We need to permute the operands to get the correct NaN behavior. The
1702 // selected operand is the second one based on the failing compare with NaN,
1703 // so permute it based on the compare type the hardware uses.
1704 if (LHS == True)
1705 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1706 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1707 }
1708 case ISD::SETUGE:
1709 case ISD::SETUGT: {
1710 if (LHS == True)
1711 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1712 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1713 }
1714 case ISD::SETGT:
1715 case ISD::SETGE:
1716 case ISD::SETOGE:
1717 case ISD::SETOGT: {
1718 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1719 !DCI.isCalledByLegalizer())
1720 return SDValue();
1721
1722 if (LHS == True)
1723 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1724 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1725 }
1726 case ISD::SETCC_INVALID:
1727 llvm_unreachable("Invalid setcc condcode!");
1728 }
1729 return SDValue();
1730}
1731
1732/// Generate Min/Max node
1733SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1734 SDValue LHS, SDValue RHS,
1735 SDValue True, SDValue False,
1736 SDValue CC,
1737 DAGCombinerInfo &DCI) const {
1738 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1739 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1740
1741 SelectionDAG &DAG = DCI.DAG;
1742
1743 // If we can't directly match this, try to see if we can fold an fneg to
1744 // match.
1745
1746 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1747 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1748 SDValue NegTrue = peekFNeg(True);
1749
1750 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1751 // fmin/fmax.
1752 //
1753 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1754 // -> fneg (fmin_legacy lhs, K)
1755 //
1756 // TODO: Use getNegatedExpression
1757 if (LHS == NegTrue && CFalse && CRHS) {
1758 APFloat NegRHS = neg(CRHS->getValueAPF());
1759 if (NegRHS == CFalse->getValueAPF()) {
1760 SDValue Combined =
1761 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1762 if (Combined)
1763 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1764 return SDValue();
1765 }
1766 }
1767
1768 return SDValue();
1769}
1770
1771std::pair<SDValue, SDValue>
1772AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1773 SDLoc SL(Op);
1774
1775 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1776
1777 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1778 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1779
1780 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1781 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1782
1783 return std::pair(Lo, Hi);
1784}
1785
1786SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1787 SDLoc SL(Op);
1788
1789 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1790 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1791 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1792}
1793
1794SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1795 SDLoc SL(Op);
1796
1797 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1798 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1799 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1800}
1801
1802// Split a vector type into two parts. The first part is a power of two vector.
1803// The second part is whatever is left over, and is a scalar if it would
1804// otherwise be a 1-vector.
1805std::pair<EVT, EVT>
1806AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1807 EVT LoVT, HiVT;
1808 EVT EltVT = VT.getVectorElementType();
1809 unsigned NumElts = VT.getVectorNumElements();
1810 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1811 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1812 HiVT = NumElts - LoNumElts == 1
1813 ? EltVT
1814 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1815 return std::pair(LoVT, HiVT);
1816}
1817
1818// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1819// scalar.
1820std::pair<SDValue, SDValue>
1821AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1822 const EVT &LoVT, const EVT &HiVT,
1823 SelectionDAG &DAG) const {
1824 EVT VT = N.getValueType();
1825 assert(LoVT.getVectorNumElements() +
1826 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1827 VT.getVectorNumElements() &&
1828 "More vector elements requested than available!");
1829 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1830 DAG.getVectorIdxConstant(0, DL));
1831
1832 unsigned LoNumElts = LoVT.getVectorNumElements();
1833
1834 if (HiVT.isVector()) {
1835 unsigned HiNumElts = HiVT.getVectorNumElements();
1836 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1837 // Avoid creating an extract_subvector with an index that isn't a multiple
1838 // of the result type.
1839 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
1840 DAG.getConstant(LoNumElts, DL, MVT::i32));
1841 return {Lo, Hi};
1842 }
1843
1844 SmallVector<SDValue, 16> Elts;
1845 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1846 /*Count=*/HiNumElts);
1847 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1848 return {Lo, Hi};
1849 }
1850
1851 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,
1852 DAG.getVectorIdxConstant(LoNumElts, DL));
1853 return {Lo, Hi};
1854}
1855
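// Splits a vector load into two loads of the low and high halves; 2-element
// vectors are scalarized instead.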
1856SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1857 SelectionDAG &DAG) const {
1858 LoadSDNode *Load = cast<LoadSDNode>(Op);
1859 EVT VT = Op.getValueType();
1860 SDLoc SL(Op);
1861
1862
1863 // If this is a 2 element vector, we really want to scalarize and not create
1864 // weird 1 element vectors.
1865 if (VT.getVectorNumElements() == 2) {
1866 SDValue Ops[2];
1867 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1868 return DAG.getMergeValues(Ops, SL);
1869 }
1870
1871 SDValue BasePtr = Load->getBasePtr();
1872 EVT MemVT = Load->getMemoryVT();
1873
1874 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1875
1876 EVT LoVT, HiVT;
1877 EVT LoMemVT, HiMemVT;
1878 SDValue Lo, Hi;
1879
1880 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1881 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1882 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1883
1884 unsigned Size = LoMemVT.getStoreSize();
1885 Align BaseAlign = Load->getAlign();
1886 Align HiAlign = commonAlignment(BaseAlign, Size);
1887
1888 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1889 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1890 BaseAlign, Load->getMemOperand()->getFlags());
1891 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1892 SDValue HiLoad =
1893 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1894 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1895 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1896
1897 SDValue Join;
1898 if (LoVT == HiVT) {
1899 // This is the case where the vector's size is a power of two, so it was split evenly.
1900 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1901 } else {
1902 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1903 DAG.getVectorIdxConstant(0, SL));
1904 Join = DAG.getNode(
1906 VT, Join, HiLoad,
1908 }
1909
1910 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1911 LoLoad.getValue(1), HiLoad.getValue(1))};
1912
1913 return DAG.getMergeValues(Ops, SL);
1914}
1915
1917 SelectionDAG &DAG) const {
1918 LoadSDNode *Load = cast<LoadSDNode>(Op);
1919 EVT VT = Op.getValueType();
1920 SDValue BasePtr = Load->getBasePtr();
1921 EVT MemVT = Load->getMemoryVT();
1922 SDLoc SL(Op);
1923 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1924 Align BaseAlign = Load->getAlign();
1925 unsigned NumElements = MemVT.getVectorNumElements();
1926
1927 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1928 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1929 if (NumElements != 3 ||
1930 (BaseAlign < Align(8) &&
1931 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1932 return SplitVectorLoad(Op, DAG);
1933
1934 assert(NumElements == 3);
1935
1936 EVT WideVT =
1938 EVT WideMemVT =
1940 SDValue WideLoad = DAG.getExtLoad(
1941 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1942 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1943 return DAG.getMergeValues(
1944 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1945 DAG.getVectorIdxConstant(0, SL)),
1946 WideLoad.getValue(1)},
1947 SL);
1948}
1949
1951 SelectionDAG &DAG) const {
1952 StoreSDNode *Store = cast<StoreSDNode>(Op);
1953 SDValue Val = Store->getValue();
1954 EVT VT = Val.getValueType();
1955
1956 // If this is a 2 element vector, we really want to scalarize and not create
1957 // weird 1 element vectors.
1958 if (VT.getVectorNumElements() == 2)
1959 return scalarizeVectorStore(Store, DAG);
1960
1961 EVT MemVT = Store->getMemoryVT();
1962 SDValue Chain = Store->getChain();
1963 SDValue BasePtr = Store->getBasePtr();
1964 SDLoc SL(Op);
1965
1966 EVT LoVT, HiVT;
1967 EVT LoMemVT, HiMemVT;
1968 SDValue Lo, Hi;
1969
1970 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1971 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1972 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1973
1974 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1975
1976 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1977 Align BaseAlign = Store->getAlign();
1978 unsigned Size = LoMemVT.getStoreSize();
1979 Align HiAlign = commonAlignment(BaseAlign, Size);
1980
1981 SDValue LoStore =
1982 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1983 Store->getMemOperand()->getFlags());
1984 SDValue HiStore =
1985 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1986 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1987
1988 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1989}
1990
1991// This is a shortcut for integer division because we have fast i32<->f32
1992// conversions, and fast f32 reciprocal instructions. The 24-bit significand of
1993// a float is enough to accurately represent up to a 24-bit signed integer.
1995 bool Sign) const {
1996 SDLoc DL(Op);
1997 EVT VT = Op.getValueType();
1998 SDValue LHS = Op.getOperand(0);
1999 SDValue RHS = Op.getOperand(1);
2000 MVT IntVT = MVT::i32;
2001 MVT FltVT = MVT::f32;
2002
2003 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2004 if (LHSSignBits < 9)
2005 return SDValue();
2006
2007 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2008 if (RHSSignBits < 9)
2009 return SDValue();
2010
2011 unsigned BitSize = VT.getSizeInBits();
2012 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2013 unsigned DivBits = BitSize - SignBits;
2014 if (Sign)
2015 ++DivBits;
2016
2019
2020 SDValue jq = DAG.getConstant(1, DL, IntVT);
2021
2022 if (Sign) {
2023 // char|short jq = ia ^ ib;
2024 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2025
2026 // jq = jq >> (bitsize - 2)
2027 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2028 DAG.getConstant(BitSize - 2, DL, VT));
2029
2030 // jq = jq | 0x1
2031 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2032 }
2033
2034 // int ia = (int)LHS;
2035 SDValue ia = LHS;
2036
2037 // int ib = (int)RHS;
2038 SDValue ib = RHS;
2039
2040 // float fa = (float)ia;
2041 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2042
2043 // float fb = (float)ib;
2044 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2045
2046 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2047 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2048
2049 // fq = trunc(fq);
2050 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2051
2052 // float fqneg = -fq;
2053 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2054
2056
2057 bool UseFmadFtz = false;
2058 if (Subtarget->isGCN()) {
2060 UseFmadFtz =
2062 }
2063
2064 // float fr = mad(fqneg, fb, fa);
2065 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2066 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2068 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2069
2070 // int iq = (int)fq;
2071 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2072
2073 // fr = fabs(fr);
2074 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2075
2076 // fb = fabs(fb);
2077 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2078
2079 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2080
2081 // int cv = fr >= fb;
2082 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2083
2084 // jq = (cv ? jq : 0);
2085 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2086
2087 // dst = iq + jq;
2088 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2089
2090 // Rem needs compensation; it's easier to recompute it.
2091 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2092 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2093
2094 // Truncate to number of bits this divide really is.
2095 if (Sign) {
2096 SDValue InRegSize
2097 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2098 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2099 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2100 } else {
2101 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2102 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2103 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2104 }
2105
2106 return DAG.getMergeValues({ Div, Rem }, DL);
2107}
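// A hedged, standalone C++ sketch of the same 24-bit trick for unsigned inputs
// (the hardware RCP is approximated here by 1.0f / fb, so this only mirrors
// the structure of the lowering above, not its exact precision):
//
//   static void udivrem24(uint32_t a, uint32_t b, uint32_t &q, uint32_t &r) {
//     float fa = (float)a, fb = (float)b;
//     float fq = truncf(fa * (1.0f / fb));      // quotient estimate
//     float fr = fabsf(fmaf(-fq, fb, fa));      // |a - fq * b|
//     q = (uint32_t)fq + (fr >= fabsf(fb) ? 1u : 0u);
//     r = a - q * b;                            // remainder is recomputed
//   }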
2108
2110 SelectionDAG &DAG,
2112 SDLoc DL(Op);
2113 EVT VT = Op.getValueType();
2114
2115 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2116
2117 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2118
2119 SDValue One = DAG.getConstant(1, DL, HalfVT);
2120 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2121
2122 // HiLo split
2123 SDValue LHS_Lo, LHS_Hi;
2124 SDValue LHS = Op.getOperand(0);
2125 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2126
2127 SDValue RHS_Lo, RHS_Hi;
2128 SDValue RHS = Op.getOperand(1);
2129 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2130
2131 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2133
2134 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2135 LHS_Lo, RHS_Lo);
2136
2137 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2138 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2139
2140 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2141 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2142 return;
2143 }
2144
2145 if (isTypeLegal(MVT::i64)) {
2146 // The algorithm here is based on ideas from "Software Integer Division",
2147 // Tom Rodeheffer, August 2008.
2148
2151
2152 // Compute denominator reciprocal.
2153 unsigned FMAD =
2154 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2157 : (unsigned)AMDGPUISD::FMAD_FTZ;
2158
2159 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2160 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2161 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2162 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2163 Cvt_Lo);
2164 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2165 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2166 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2167 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2168 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2169 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2170 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2171 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2172 Mul1);
2173 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2174 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2175 SDValue Rcp64 = DAG.getBitcast(VT,
2176 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2177
2178 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2179 SDValue One64 = DAG.getConstant(1, DL, VT);
2180 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2181 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2182
2183 // First round of UNR (Unsigned integer Newton-Raphson).
2184 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2185 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2186 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2187 SDValue Mulhi1_Lo, Mulhi1_Hi;
2188 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2189 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2190 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2191 Mulhi1_Lo, Zero1);
2192 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2193 Mulhi1_Hi, Add1_Lo.getValue(1));
2194 SDValue Add1 = DAG.getBitcast(VT,
2195 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2196
2197 // Second round of UNR.
2198 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2199 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2200 SDValue Mulhi2_Lo, Mulhi2_Hi;
2201 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2202 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2203 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2204 Mulhi2_Lo, Zero1);
2205 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2206 Mulhi2_Hi, Add2_Lo.getValue(1));
2207 SDValue Add2 = DAG.getBitcast(VT,
2208 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2209
2210 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2211
2212 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2213
2214 SDValue Mul3_Lo, Mul3_Hi;
2215 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2216 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2217 Mul3_Lo, Zero1);
2218 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2219 Mul3_Hi, Sub1_Lo.getValue(1));
2220 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2221 SDValue Sub1 = DAG.getBitcast(VT,
2222 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2223
2224 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2225 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2226 ISD::SETUGE);
2227 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2228 ISD::SETUGE);
2229 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2230
2231 // TODO: Here and below portions of the code can be enclosed into if/endif.
2232 // Currently control flow is unconditional and we have 4 selects after
2233 // potential endif to substitute PHIs.
2234
2235 // if C3 != 0 ...
2236 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2237 RHS_Lo, Zero1);
2238 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2239 RHS_Hi, Sub1_Lo.getValue(1));
2240 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2241 Zero, Sub2_Lo.getValue(1));
2242 SDValue Sub2 = DAG.getBitcast(VT,
2243 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2244
2245 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2246
2247 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2248 ISD::SETUGE);
2249 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2250 ISD::SETUGE);
2251 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2252
2253 // if (C6 != 0)
2254 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2255
2256 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2257 RHS_Lo, Zero1);
2258 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2259 RHS_Hi, Sub2_Lo.getValue(1));
2260 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2261 Zero, Sub3_Lo.getValue(1));
2262 SDValue Sub3 = DAG.getBitcast(VT,
2263 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2264
2265 // endif C6
2266 // endif C3
2267
2268 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2269 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2270
2271 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2272 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2273
2274 Results.push_back(Div);
2275 Results.push_back(Rem);
2276
2277 return;
2278 }
2279
2280 // r600 expansion.
2281 // Get Speculative values
2282 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2283 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2284
2285 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2286 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2287 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2288
2289 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2290 SDValue DIV_Lo = Zero;
2291
2292 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2293
2294 for (unsigned i = 0; i < halfBitWidth; ++i) {
2295 const unsigned bitPos = halfBitWidth - i - 1;
2296 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2297 // Get value of high bit
2298 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2299 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2300 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2301
2302 // Shift
2303 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2304 // Add LHS high bit
2305 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2306
2307 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2308 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2309
2310 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2311
2312 // Update REM
2313 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2314 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2315 }
2316
2317 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2318 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2319 Results.push_back(DIV);
2320 Results.push_back(REM);
2321}
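// The loop above is classic restoring long division; a hedged host-side
// sketch of the general idea (uint64_t stands in for the split i64 values,
// and the speculative high-half handling is omitted):
//
//   static void udivrem64_bitwise(uint64_t lhs, uint64_t rhs,
//                                 uint64_t &div, uint64_t &rem) {
//     div = 0;
//     rem = 0;
//     for (int bit = 63; bit >= 0; --bit) {
//       rem = (rem << 1) | ((lhs >> bit) & 1);  // shift in the next bit
//       if (rem >= rhs) {                       // same compare as the selectcc
//         rem -= rhs;
//         div |= (uint64_t)1 << bit;
//       }
//     }
//   }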
2322
2324 SelectionDAG &DAG) const {
2325 SDLoc DL(Op);
2326 EVT VT = Op.getValueType();
2327
2328 if (VT == MVT::i64) {
2330 LowerUDIVREM64(Op, DAG, Results);
2331 return DAG.getMergeValues(Results, DL);
2332 }
2333
2334 if (VT == MVT::i32) {
2335 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2336 return Res;
2337 }
2338
2339 SDValue X = Op.getOperand(0);
2340 SDValue Y = Op.getOperand(1);
2341
2342 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2343 // algorithm used here.
2344
2345 // Initial estimate of inv(y).
2346 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2347
2348 // One round of UNR.
2349 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2350 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2351 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2352 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2353
2354 // Quotient/remainder estimate.
2355 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2356 SDValue R =
2357 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2358
2359 // First quotient/remainder refinement.
2360 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2361 SDValue One = DAG.getConstant(1, DL, VT);
2362 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2363 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2364 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2365 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2366 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2367
2368 // Second quotient/remainder refinement.
2369 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2370 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2371 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2372 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2373 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2374
2375 return DAG.getMergeValues({Q, R}, DL);
2376}
2377
2379 SelectionDAG &DAG) const {
2380 SDLoc DL(Op);
2381 EVT VT = Op.getValueType();
2382
2383 SDValue LHS = Op.getOperand(0);
2384 SDValue RHS = Op.getOperand(1);
2385
2386 SDValue Zero = DAG.getConstant(0, DL, VT);
2387 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2388
2389 if (VT == MVT::i32) {
2390 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2391 return Res;
2392 }
2393
2394 if (VT == MVT::i64 &&
2395 DAG.ComputeNumSignBits(LHS) > 32 &&
2396 DAG.ComputeNumSignBits(RHS) > 32) {
2397 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2398
2399 // HiLo split
2400 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2401 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2402 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2403 LHS_Lo, RHS_Lo);
2404 SDValue Res[2] = {
2405 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2406 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2407 };
2408 return DAG.getMergeValues(Res, DL);
2409 }
2410
2411 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2412 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2413 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2414 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2415
2416 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2417 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2418
2419 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2420 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2421
2422 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2423 SDValue Rem = Div.getValue(1);
2424
2425 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2426 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2427
2428 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2429 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2430
2431 SDValue Res[2] = {
2432 Div,
2433 Rem
2434 };
2435 return DAG.getMergeValues(Res, DL);
2436}
2437
2438// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2440 SDLoc SL(Op);
2441 EVT VT = Op.getValueType();
2442 auto Flags = Op->getFlags();
2443 SDValue X = Op.getOperand(0);
2444 SDValue Y = Op.getOperand(1);
2445
2446 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2447 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2448 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2449 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2450 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2451}
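// Hedged scalar sketch of the expansion above (assumes an FMA is available;
// the result is not a correctly rounded fmod):
//
//   static float frem_approx(float x, float y) {
//     return fmaf(-truncf(x / y), y, x);   // x - trunc(x / y) * y, fused
//   }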
2452
2454 SDLoc SL(Op);
2455 SDValue Src = Op.getOperand(0);
2456
2457 // result = trunc(src)
2458 // if (src > 0.0 && src != result)
2459 // result += 1.0
2460
2461 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2462
2463 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2464 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2465
2466 EVT SetCCVT =
2467 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2468
2469 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2470 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2471 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2472
2473 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2474 // TODO: Should this propagate fast-math-flags?
2475 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2476}
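// Hedged scalar sketch of the select sequence above:
//
//   static double ceil_via_trunc(double src) {
//     double t = trunc(src);
//     return t + ((src > 0.0 && src != t) ? 1.0 : 0.0);
//   }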
2477
2479 SelectionDAG &DAG) {
2480 const unsigned FractBits = 52;
2481 const unsigned ExpBits = 11;
2482
2483 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2484 Hi,
2485 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2486 DAG.getConstant(ExpBits, SL, MVT::i32));
2487 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2488 DAG.getConstant(1023, SL, MVT::i32));
2489
2490 return Exp;
2491}
2492
2494 SDLoc SL(Op);
2495 SDValue Src = Op.getOperand(0);
2496
2497 assert(Op.getValueType() == MVT::f64);
2498
2499 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2500
2501 // Extract the upper half, since this is where we will find the sign and
2502 // exponent.
2503 SDValue Hi = getHiHalf64(Src, DAG);
2504
2505 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2506
2507 const unsigned FractBits = 52;
2508
2509 // Extract the sign bit.
2510 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2511 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2512
2513 // Extend back to 64 bits.
2514 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2515 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2516
2517 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2518 const SDValue FractMask
2519 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2520
2521 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2522 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2523 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2524
2525 EVT SetCCVT =
2526 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2527
2528 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2529
2530 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2531 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2532
2533 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2534 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2535
2536 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2537}
2538
2540 SelectionDAG &DAG) const {
2541 SDLoc SL(Op);
2542 SDValue Src = Op.getOperand(0);
2543
2544 assert(Op.getValueType() == MVT::f64);
2545
2546 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2547 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2548 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2549
2550 // TODO: Should this propagate fast-math-flags?
2551
2552 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2553 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2554
2555 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2556
2557 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2558 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2559
2560 EVT SetCCVT =
2561 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2562 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2563
2564 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2565}
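// The add/subtract of copysign(2^52, x) above is the classic magic-number
// trick: for |x| < 2^52, x + 2^52 has no fractional bits left, so subtracting
// 2^52 again yields x rounded to the nearest integer (ties to even under the
// default rounding mode). A hedged scalar sketch:
//
//   static double roundeven_via_magic(double x) {
//     double magic = copysign(0x1.0p+52, x);
//     double r = (x + magic) - magic;            // must not be re-associated
//     return fabs(x) > 0x1.fffffffffffffp+51 ? x : r;
//   }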
2566
2568 SelectionDAG &DAG) const {
2569 // FNEARBYINT and FRINT are the same, except in their handling of FP
2570 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2571 // rint, so just treat them as equivalent.
2572 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2573 Op.getOperand(0));
2574}
2575
2577 auto VT = Op.getValueType();
2578 auto Arg = Op.getOperand(0u);
2579 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2580}
2581
2582// XXX - May require not supporting f32 denormals?
2583
2584// Don't handle v2f16. The extra instructions to scalarize and repack around the
2585// compare and vselect end up producing worse code than scalarizing the whole
2586// operation.
2588 SDLoc SL(Op);
2589 SDValue X = Op.getOperand(0);
2590 EVT VT = Op.getValueType();
2591
2592 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2593
2594 // TODO: Should this propagate fast-math-flags?
2595
2596 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2597
2598 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2599
2600 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2601 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2602
2603 EVT SetCCVT =
2604 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2605
2606 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2607 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2608 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2609
2610 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2611 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2612}
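// Hedged scalar sketch of the lowering above (round-half-away-from-zero,
// matching llvm.round semantics):
//
//   static float round_via_trunc(float x) {
//     float t = truncf(x);
//     float off = (fabsf(x - t) >= 0.5f) ? 1.0f : 0.0f;
//     return t + copysignf(off, x);
//   }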
2613
2615 SDLoc SL(Op);
2616 SDValue Src = Op.getOperand(0);
2617
2618 // result = trunc(src);
2619 // if (src < 0.0 && src != result)
2620 // result += -1.0.
2621
2622 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2623
2624 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2625 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2626
2627 EVT SetCCVT =
2628 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2629
2630 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2631 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2632 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2633
2634 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2635 // TODO: Should this propagate fast-math-flags?
2636 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2637}
2638
2639/// Return true if it's known that \p Src can never be an f32 denormal value.
2641 switch (Src.getOpcode()) {
2642 case ISD::FP_EXTEND:
2643 return Src.getOperand(0).getValueType() == MVT::f16;
2644 case ISD::FP16_TO_FP:
2645 case ISD::FFREXP:
2646 return true;
2648 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2649 switch (IntrinsicID) {
2650 case Intrinsic::amdgcn_frexp_mant:
2651 return true;
2652 default:
2653 return false;
2654 }
2655 }
2656 default:
2657 return false;
2658 }
2659
2660 llvm_unreachable("covered opcode switch");
2661}
2662
2664 SDNodeFlags Flags) {
2665 return Flags.hasApproximateFuncs();
2666}
2667
2669 SDValue Src,
2670 SDNodeFlags Flags) {
2671 return !valueIsKnownNeverF32Denorm(Src) &&
2672 DAG.getMachineFunction()
2675}
2676
2678 SDValue Src,
2679 SDNodeFlags Flags) const {
2680 SDLoc SL(Src);
2681 EVT VT = Src.getValueType();
2682 const fltSemantics &Semantics = VT.getFltSemantics();
2683 SDValue SmallestNormal =
2684 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2685
2686 // Want to scale denormals up, but negatives and 0 work just as well on the
2687 // scaled path.
2688 SDValue IsLtSmallestNormal = DAG.getSetCC(
2689 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2690 SmallestNormal, ISD::SETOLT);
2691
2692 return IsLtSmallestNormal;
2693}
2694
2696 SDNodeFlags Flags) const {
2697 SDLoc SL(Src);
2698 EVT VT = Src.getValueType();
2699 const fltSemantics &Semantics = VT.getFltSemantics();
2700 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2701
2702 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2703 SDValue IsFinite = DAG.getSetCC(
2704 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2705 Inf, ISD::SETOLT);
2706 return IsFinite;
2707}
2708
2709/// If denormal handling is required, return the scaled input to FLOG2 and the
2710/// check for the denormal range. Otherwise, return null values.
2711std::pair<SDValue, SDValue>
2713 SDValue Src, SDNodeFlags Flags) const {
2714 if (!needsDenormHandlingF32(DAG, Src, Flags))
2715 return {};
2716
2717 MVT VT = MVT::f32;
2718 const fltSemantics &Semantics = APFloat::IEEEsingle();
2719 SDValue SmallestNormal =
2720 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2721
2722 SDValue IsLtSmallestNormal = DAG.getSetCC(
2723 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2724 SmallestNormal, ISD::SETOLT);
2725
2726 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2727 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2728 SDValue ScaleFactor =
2729 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2730
2731 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2732 return {ScaledInput, IsLtSmallestNormal};
2733}
2734
2736 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2737 // If we have to handle denormals, scale up the input and adjust the result.
2738
2739 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2740 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2741
2742 SDLoc SL(Op);
2743 EVT VT = Op.getValueType();
2744 SDValue Src = Op.getOperand(0);
2745 SDNodeFlags Flags = Op->getFlags();
2746
2747 if (VT == MVT::f16) {
2748 // Nothing in half is a denormal when promoted to f32.
2749 assert(!Subtarget->has16BitInsts());
2750 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2751 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2752 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2753 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2754 }
2755
2756 auto [ScaledInput, IsLtSmallestNormal] =
2757 getScaledLogInput(DAG, SL, Src, Flags);
2758 if (!ScaledInput)
2759 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2760
2761 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2762
2763 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2764 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2765 SDValue ResultOffset =
2766 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2767 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2768}
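// Worked example of the scaling above (values chosen for illustration only):
// for a denormal input x = 0x1.0p-140, the scaled input is
// x * 0x1.0p+32 = 0x1.0p-108, which the hardware log handles directly, and
// the final result is log2(0x1.0p-108) - 32 = -108 - 32 = -140, as expected.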
2769
2770static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2771 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2772 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2773 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2774}
2775
2777 SelectionDAG &DAG) const {
2778 SDValue X = Op.getOperand(0);
2779 EVT VT = Op.getValueType();
2780 SDNodeFlags Flags = Op->getFlags();
2781 SDLoc DL(Op);
2782
2783 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2784 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2785
2786 const auto &Options = getTargetMachine().Options;
2787 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2788
2789 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2790 // Log and multiply in f32 is good enough for f16.
2791 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2792 }
2793
2794 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2795 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2796 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2797 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2798 }
2799
2800 return Lowered;
2801 }
2802
2803 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2804 if (ScaledInput)
2805 X = ScaledInput;
2806
2807 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2808
2809 SDValue R;
2810 if (Subtarget->hasFastFMAF32()) {
2811 // c + cc is ln(2)/ln(10) to more than 49 bits
2812 const float c_log10 = 0x1.344134p-2f;
2813 const float cc_log10 = 0x1.09f79ep-26f;
2814
2815 // c + cc is ln(2) to more than 49 bits
2816 const float c_log = 0x1.62e42ep-1f;
2817 const float cc_log = 0x1.efa39ep-25f;
2818
2819 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2820 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2821
2822 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2823 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2824 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2825 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2826 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2827 } else {
2828 // ch+ct is ln(2)/ln(10) to more than 36 bits
2829 const float ch_log10 = 0x1.344000p-2f;
2830 const float ct_log10 = 0x1.3509f6p-18f;
2831
2832 // ch + ct is ln(2) to more than 36 bits
2833 const float ch_log = 0x1.62e000p-1f;
2834 const float ct_log = 0x1.0bfbe8p-15f;
2835
2836 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2837 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2838
2839 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2840 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2841 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2842 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2843 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2844
2845 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2846 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2847 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2848 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2849 }
2850
2851 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2852 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2853
2854 // TODO: Check if known finite from source value.
2855 if (!IsFiniteOnly) {
2856 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2857 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2858 }
2859
2860 if (IsScaled) {
2861 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2862 SDValue ShiftK =
2863 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2864 SDValue Shift =
2865 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2866 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2867 }
2868
2869 return R;
2870}
2871
2873 return LowerFLOGCommon(Op, DAG);
2874}
2875
2876// Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2877// promoted f16 operation.
2879 SelectionDAG &DAG, bool IsLog10,
2880 SDNodeFlags Flags) const {
2881 EVT VT = Src.getValueType();
2882 unsigned LogOp =
2883 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2884
2885 double Log2BaseInverted =
2887
2888 if (VT == MVT::f32) {
2889 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2890 if (ScaledInput) {
2891 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2892 SDValue ScaledResultOffset =
2893 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2894
2895 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2896
2897 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2898 ScaledResultOffset, Zero, Flags);
2899
2900 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2901
2902 if (Subtarget->hasFastFMAF32())
2903 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2904 Flags);
2905 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2906 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2907 }
2908 }
2909
2910 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2911 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2912
2913 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2914 Flags);
2915}
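// Hedged summary of the fast-math path above, where K = Log2BaseInverted
// (ln(2) for flog, ln(2)/ln(10) for flog10) and "scaled" means a denormal
// input was pre-multiplied by 2^32:
//
//   logb(x)         ~= log2(x) * K
//   logb(x), scaled ~= fma(log2(x * 0x1.0p+32f), K, -32.0f * K)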
2916
2918 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2919 // If we have to handle denormals, scale up the input and adjust the result.
2920
2921 SDLoc SL(Op);
2922 EVT VT = Op.getValueType();
2923 SDValue Src = Op.getOperand(0);
2924 SDNodeFlags Flags = Op->getFlags();
2925
2926 if (VT == MVT::f16) {
2927 // Nothing in half is a denormal when promoted to f32.
2928 assert(!Subtarget->has16BitInsts());
2929 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2930 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2931 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2932 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2933 }
2934
2935 assert(VT == MVT::f32);
2936
2937 if (!needsDenormHandlingF32(DAG, Src, Flags))
2938 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2939
2940 // bool needs_scaling = x < -0x1.f80000p+6f;
2941 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2942
2943 // -nextafter(128.0, -1)
2944 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2945
2946 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2947
2948 SDValue NeedsScaling =
2949 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2950
2951 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2952 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2953
2954 SDValue AddOffset =
2955 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2956
2957 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2958 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2959
2960 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2961 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2962 SDValue ResultScale =
2963 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2964
2965 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2966}
2967
2969 SelectionDAG &DAG,
2970 SDNodeFlags Flags) const {
2971 EVT VT = X.getValueType();
2972 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2973
2974 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2975 // exp2(M_LOG2E_F * f);
2976 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2977 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2978 : (unsigned)ISD::FEXP2,
2979 SL, VT, Mul, Flags);
2980 }
2981
2982 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2983
2984 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2985 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2986
2987 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2988
2989 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2990
2991 SDValue AdjustedX =
2992 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2993
2994 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2995
2996 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2997
2998 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2999 SDValue AdjustedResult =
3000 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3001
3002 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3003 Flags);
3004}
3005
3006/// Emit an approx-funcs-appropriate lowering for exp10; inf/nan should still be
3007/// handled correctly.
3009 SelectionDAG &DAG,
3010 SDNodeFlags Flags) const {
3011 const EVT VT = X.getValueType();
3012 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3013 : static_cast<unsigned>(ISD::FEXP2);
3014
3015 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3016 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3017 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3018 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3019
3020 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3021 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3022 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3023 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3024 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3025 }
3026
3027 // bool s = x < -0x1.2f7030p+5f;
3028 // x += s ? 0x1.0p+5f : 0.0f;
3029 // exp10 = exp2(x * 0x1.a92000p+1f) *
3030 // exp2(x * 0x1.4f0978p-11f) *
3031 // (s ? 0x1.9f623ep-107f : 1.0f);
3032
3033 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3034
3035 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3036 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3037
3038 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3039 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3040 SDValue AdjustedX =
3041 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3042
3043 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3044 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3045
3046 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3047 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3048 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3049 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3050
3051 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3052
3053 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3054 SDValue AdjustedResult =
3055 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3056
3057 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3058 Flags);
3059}
3060
3062 EVT VT = Op.getValueType();
3063 SDLoc SL(Op);
3064 SDValue X = Op.getOperand(0);
3065 SDNodeFlags Flags = Op->getFlags();
3066 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3067
3068 if (VT.getScalarType() == MVT::f16) {
3069 // v_exp_f16 (fmul x, log2e)
3070 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3071 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3072
3073 if (VT.isVector())
3074 return SDValue();
3075
3076 // exp(f16 x) ->
3077 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3078
3079 // Nothing in half is a denormal when promoted to f32.
3080 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3081 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3082 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3083 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3084 }
3085
3086 assert(VT == MVT::f32);
3087
3088 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3089 // library behavior. Also, is known-not-daz source sufficient?
3090 if (allowApproxFunc(DAG, Flags)) {
3091 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3092 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3093 }
3094
3095 // Algorithm:
3096 //
3097 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3098 //
3099 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3100 // n = 64*m + j, 0 <= j < 64
3101 //
3102 // e^x = 2^((64*m + j + f)/64)
3103 // = (2^m) * (2^(j/64)) * 2^(f/64)
3104 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3105 //
3106 // f = x*(64/ln(2)) - n
3107 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3108 //
3109 // e^x = (2^m) * (2^(j/64)) * e^r
3110 //
3111 // (2^(j/64)) is precomputed
3112 //
3113 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3114 // e^r = 1 + q
3115 //
3116 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3117 //
3118 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3119 SDNodeFlags FlagsNoContract = Flags;
3120 FlagsNoContract.setAllowContract(false);
3121
3122 SDValue PH, PL;
3123 if (Subtarget->hasFastFMAF32()) {
3124 const float c_exp = numbers::log2ef;
3125 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3126 const float c_exp10 = 0x1.a934f0p+1f;
3127 const float cc_exp10 = 0x1.2f346ep-24f;
3128
3129 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3130 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3131
3132 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3133 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3134 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3135 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3136 } else {
3137 const float ch_exp = 0x1.714000p+0f;
3138 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3139
3140 const float ch_exp10 = 0x1.a92000p+1f;
3141 const float cl_exp10 = 0x1.4f0978p-11f;
3142
3143 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3144 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3145
3146 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3147 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3148 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3149 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3150 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3151
3152 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3153
3154 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3155 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3156 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3157 }
3158
3159 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3160
3161 // It is unsafe to contract this fsub into the PH multiply.
3162 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3163
3164 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3165 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3166 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3167
3168 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3169
3170 SDValue UnderflowCheckConst =
3171 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3172
3173 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3174 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3175 SDValue Underflow =
3176 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3177
3178 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3179 const auto &Options = getTargetMachine().Options;
3180
3181 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3182 SDValue OverflowCheckConst =
3183 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3184 SDValue Overflow =
3185 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3186 SDValue Inf =
3188 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3189 }
3190
3191 return R;
3192}
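// Hedged summary of the reduction emitted above, where C is log2(e) for exp
// and log2(10) for exp10, carried as a high/low pair PH + PL for extra
// precision:
//
//   PH + PL ~= x * C
//   E = roundeven(PH)
//   R = ldexp(exp2((PH - E) + PL), (int)E)
//   R = (x < underflow_bound) ? 0.0f : R
//   R = (x > overflow_bound) ? +inf : R   // only when infinities matter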
3193
3194static bool isCtlzOpc(unsigned Opc) {
3195 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3196}
3197
3198static bool isCttzOpc(unsigned Opc) {
3199 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3200}
3201
3203 SelectionDAG &DAG) const {
3204 auto SL = SDLoc(Op);
3205 auto Opc = Op.getOpcode();
3206 auto Arg = Op.getOperand(0u);
3207 auto ResultVT = Op.getValueType();
3208
3209 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3210 return {};
3211
3213 assert(ResultVT == Arg.getValueType());
3214
3215 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3216 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3217 SDValue NewOp;
3218
3219 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3220 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3221 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3222 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3223 } else {
3224 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3225 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3226 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3227 }
3228
3229 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3230}
3231
3233 SDLoc SL(Op);
3234 SDValue Src = Op.getOperand(0);
3235
3236 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3237 bool Ctlz = isCtlzOpc(Op.getOpcode());
3238 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3239
3240 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3241 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3242 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3243
3244 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3245 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3246 // (cttz hi:lo) -> (umin (ffbl src), 32)
3247 // (ctlz_zero_undef src) -> (ffbh src)
3248 // (cttz_zero_undef src) -> (ffbl src)
3249
3250 // The 64-bit scalar version produces a 32-bit result
3251 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3252 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3253 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3254 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3255 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3256 if (!ZeroUndef) {
3257 const SDValue ConstVal = DAG.getConstant(
3258 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3259 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3260 }
3261 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3262 }
3263
3264 SDValue Lo, Hi;
3265 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3266
3267 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3268 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3269
3270 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3271 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3272 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3273 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3274
3275 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3276 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3277 if (Ctlz)
3278 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3279 else
3280 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3281
3282 SDValue NewOpr;
3283 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3284 if (!ZeroUndef) {
3285 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3286 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3287 }
3288
3289 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3290}
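// Hedged scalar sketch of the 64-bit ctlz path above; the saturating add
// keeps an all-zero low half from winning the min against the high half
// (ffbh_u32 returns 0xffffffff for a zero input):
//
//   static uint32_t ctlz64(uint32_t lo, uint32_t hi) {
//     uint32_t FfbhHi = hi ? __builtin_clz(hi) : 0xffffffffu;
//     uint32_t FfbhLo = lo ? __builtin_clz(lo) : 0xffffffffu;
//     uint32_t LoTerm =
//         FfbhLo == 0xffffffffu ? 0xffffffffu : FfbhLo + 32;  // uaddsat
//     uint32_t Min = std::min(FfbhHi, LoTerm);                // umin
//     return std::min(Min, 64u);                              // clamp to 64
//   }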
3291
3293 bool Signed) const {
3294 // The regular method of converting a 64-bit integer to float roughly consists
3295 // of 2 steps: normalization and rounding. In fact, after normalization, the
3296 // conversion from a 64-bit integer to a float is essentially the same as the
3297 // one from a 32-bit integer. The only difference is that it has more
3298 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3299 // 64-bit integer can be preprocessed to fit into a 32-bit integer and then
3300 // converted into the correct float number. The basic steps for the unsigned
3301 // conversion are illustrated in the following pseudo code:
3302 //
3303 // f32 uitofp(i64 u) {
3304 // i32 hi, lo = split(u);
3305 // // Only count the leading zeros in hi as we have native support of the
3306 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3307 // // reduced to a 32-bit one automatically.
3308 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3309 // u <<= shamt;
3310 // hi, lo = split(u);
3311 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3312 // // convert it as a 32-bit integer and scale the result back.
3313 // return uitofp(hi) * 2^(32 - shamt);
3314 // }
3315 //
3316 // The signed conversion follows the same principle but uses 'ffbh_i32' to
3317 // count its sign bits instead. If 'ffbh_i32' is not available, the absolute
3318 // value is converted instead, followed by a negation based on the sign bit.
3319
3320 SDLoc SL(Op);
3321 SDValue Src = Op.getOperand(0);
3322
3323 SDValue Lo, Hi;
3324 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3325 SDValue Sign;
3326 SDValue ShAmt;
3327 if (Signed && Subtarget->isGCN()) {
3328 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3329 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3330 // account. That is, the maximal shift is
3331 // - 32 if Lo and Hi have opposite signs;
3332 // - 33 if Lo and Hi have the same sign.
3333 //
3334 // Or, MaxShAmt = 33 + OppositeSign, where
3335 //
3336 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3337 // - -1 if Lo and Hi have opposite signs; and
3338 // - 0 otherwise.
3339 //
3340 // All in all, ShAmt is calculated as
3341 //
3342 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3343 //
3344 // or
3345 //
3346 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3347 //
3348 // to reduce the critical path.
3349 SDValue OppositeSign = DAG.getNode(
3350 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3351 DAG.getConstant(31, SL, MVT::i32));
3352 SDValue MaxShAmt =
3353 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3354 OppositeSign);
3355 // Count the leading sign bits.
3356 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3357 // Unlike the unsigned conversion, the shift should be one bit less to
3358 // preserve the sign bit.
3359 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3360 DAG.getConstant(1, SL, MVT::i32));
3361 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3362 } else {
3363 if (Signed) {
3364 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3365 // absolute value first.
3366 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3367 DAG.getConstant(63, SL, MVT::i64));
3368 SDValue Abs =
3369 DAG.getNode(ISD::XOR, SL, MVT::i64,
3370 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3371 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3372 }
3373 // Count the leading zeros.
3374 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3375 // The shift amount for signed integers is [0, 32].
3376 }
3377 // Normalize the given 64-bit integer.
3378 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3379 // Split it again.
3380 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3381 // Calculate the adjustment bit for rounding.
3382 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3383 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3384 DAG.getConstant(1, SL, MVT::i32), Lo);
3385 // Get the 32-bit normalized integer.
3386 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3387 // Convert the normalized 32-bit integer into f32.
3388 unsigned Opc =
3389 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3390 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3391
3392 // Finally, scale the converted float back, since the original 64-bit integer
3393 // was converted as a 32-bit one.
3394 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3395 ShAmt);
3396 // On GCN, use LDEXP directly.
3397 if (Subtarget->isGCN())
3398 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3399
3400 // Otherwise, shift 'ShAmt' into position in the exponent field and add it
3401 // directly to the exponent to emulate the multiplication by 2^ShAmt. The 8-bit
3402 // exponent is enough to avoid overflowing into the sign bit.
3403 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3404 DAG.getConstant(23, SL, MVT::i32));
3405 SDValue IVal =
3406 DAG.getNode(ISD::ADD, SL, MVT::i32,
3407 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3408 if (Signed) {
3409 // Set the sign bit.
3410 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3411 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3412 DAG.getConstant(31, SL, MVT::i32));
3413 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3414 }
3415 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3416}
3417
3419 bool Signed) const {
3420 SDLoc SL(Op);
3421 SDValue Src = Op.getOperand(0);
3422
3423 SDValue Lo, Hi;
3424 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3425
3427 SL, MVT::f64, Hi);
3428
3429 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3430
3431 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3432 DAG.getConstant(32, SL, MVT::i32));
3433 // TODO: Should this propagate fast-math-flags?
3434 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3435}
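// Hedged scalar sketch of the unsigned i64 -> f64 path above:
//
//   static double uitofp_i64(uint64_t u) {
//     double HiF = (double)(uint32_t)(u >> 32);
//     double LoF = (double)(uint32_t)u;
//     return ldexp(HiF, 32) + LoF;   // hi * 2^32 + lo, rounded once
//   }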
3436
3438 SelectionDAG &DAG) const {
3439 // TODO: Factor out code common with LowerSINT_TO_FP.
3440 EVT DestVT = Op.getValueType();
3441 SDValue Src = Op.getOperand(0);
3442 EVT SrcVT = Src.getValueType();
3443
3444 if (SrcVT == MVT::i16) {
3445 if (DestVT == MVT::f16)
3446 return Op;
3447 SDLoc DL(Op);
3448
3449 // Promote src to i32
3450 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3451 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3452 }
3453
3454 if (DestVT == MVT::bf16) {
3455 SDLoc SL(Op);
3456 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3457 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3458 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3459 }
3460
3461 if (SrcVT != MVT::i64)
3462 return Op;
3463
3464 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3465 SDLoc DL(Op);
3466
3467 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3468 SDValue FPRoundFlag =
3469 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3470 SDValue FPRound =
3471 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3472
3473 return FPRound;
3474 }
3475
3476 if (DestVT == MVT::f32)
3477 return LowerINT_TO_FP32(Op, DAG, false);
3478
3479 assert(DestVT == MVT::f64);
3480 return LowerINT_TO_FP64(Op, DAG, false);
3481}
3482
3483SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3484 SelectionDAG &DAG) const {
3485 EVT DestVT = Op.getValueType();
3486
3487 SDValue Src = Op.getOperand(0);
3488 EVT SrcVT = Src.getValueType();
3489
3490 if (SrcVT == MVT::i16) {
3491 if (DestVT == MVT::f16)
3492 return Op;
3493
3494 SDLoc DL(Op);
3495 // Promote src to i32
3496 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3497 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3498 }
3499
3500 if (DestVT == MVT::bf16) {
3501 SDLoc SL(Op);
3502 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3503 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3504 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3505 }
3506
3507 if (SrcVT != MVT::i64)
3508 return Op;
3509
3510 // TODO: Factor out code common with LowerUINT_TO_FP.
3511
3512 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3513 SDLoc DL(Op);
3514 SDValue Src = Op.getOperand(0);
3515
3516 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3517 SDValue FPRoundFlag =
3518 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3519 SDValue FPRound =
3520 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3521
3522 return FPRound;
3523 }
3524
3525 if (DestVT == MVT::f32)
3526 return LowerINT_TO_FP32(Op, DAG, true);
3527
3528 assert(DestVT == MVT::f64);
3529 return LowerINT_TO_FP64(Op, DAG, true);
3530}
3531
3532SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3533 bool Signed) const {
3534 SDLoc SL(Op);
3535
3536 SDValue Src = Op.getOperand(0);
3537 EVT SrcVT = Src.getValueType();
3538
3539 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3540
3541 // The basic idea of converting a floating point number into a pair of 32-bit
3542 // integers is illustrated as follows:
3543 //
3544 // tf := trunc(val);
3545 // hif := floor(tf * 2^-32);
3546 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3547 // hi := fptoi(hif);
3548 // lo := fptoi(lof);
3549 //
3550 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3551 SDValue Sign;
3552 if (Signed && SrcVT == MVT::f32) {
3553 // However, a 32-bit floating point number has only a 23-bit mantissa,
3554 // which is not enough to hold all the significant bits of `lof` if val is
3555 // negative. To avoid the loss of precision, we need to take the absolute
3556 // value after truncating and flip the result back based on the original
3557 // signedness.
3558 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3559 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3560 DAG.getConstant(31, SL, MVT::i32));
3561 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3562 }
3563
3564 SDValue K0, K1;
3565 if (SrcVT == MVT::f64) {
3566 K0 = DAG.getConstantFP(
3567 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3568 SrcVT);
3569 K1 = DAG.getConstantFP(
3570 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3571 SrcVT);
3572 } else {
3573 K0 = DAG.getConstantFP(
3574 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3575 K1 = DAG.getConstantFP(
3576 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3577 }
3578 // TODO: Should this propagate fast-math-flags?
3579 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3580
3581 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3582
3583 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3584
3585 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3586 : ISD::FP_TO_UINT,
3587 SL, MVT::i32, FloorMul);
3588 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3589
3590 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3591 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3592
3593 if (Signed && SrcVT == MVT::f32) {
3594 assert(Sign);
3595 // Flip the result based on the signedness, which is either all 0s or 1s.
3596 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3597 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3598 // r := xor(r, sign) - sign;
3599 Result =
3600 DAG.getNode(ISD::SUB, SL, MVT::i64,
3601 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3602 }
3603
3604 return Result;
3605}
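
// Illustrative sketch only (not part of this file): the unsigned f64 -> u64
// split built above, in scalar form; 0x1p-32 and -0x1p32 mirror the K0/K1
// constants, and the value is assumed to lie in [0, 2^64).
#include <cmath>
#include <cstdint>

static uint64_t f64ToU64Sketch(double Val) {
  double Trunc = std::trunc(Val);
  double HiF = std::floor(Trunc * 0x1p-32);   // K0 = 2^-32
  double LoF = std::fma(HiF, -0x1p32, Trunc); // K1 = -2^32; lof >= 0 by floor
  uint32_t Hi = uint32_t(HiF);
  uint32_t Lo = uint32_t(LoF);
  return (uint64_t(Hi) << 32) | Lo;           // build_vector {Lo, Hi} as i64
}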
3606
3607SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3608 SDLoc DL(Op);
3609 SDValue N0 = Op.getOperand(0);
3610
3611 // Convert to target node to get known bits
3612 if (N0.getValueType() == MVT::f32)
3613 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3614
3615 if (Op->getFlags().hasApproximateFuncs()) {
3616 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3617 return SDValue();
3618 }
3619
3620 return LowerF64ToF16Safe(N0, DL, DAG);
3621}
3622
3623// Returns the f16 result bits packed in an i32 node.
3624SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3625 SelectionDAG &DAG) const {
3626 assert(Src.getSimpleValueType() == MVT::f64);
3627
3628 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3629 // TODO: We can generate better code for True16.
3630 const unsigned ExpMask = 0x7ff;
3631 const unsigned ExpBiasf64 = 1023;
3632 const unsigned ExpBiasf16 = 15;
3633 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3634 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3635 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3636 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3637 DAG.getConstant(32, DL, MVT::i64));
3638 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3639 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3640 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3641 DAG.getConstant(20, DL, MVT::i64));
3642 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3643 DAG.getConstant(ExpMask, DL, MVT::i32));
3644 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3645 // add the f16 bias (15) to get the biased exponent for the f16 format.
3646 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3647 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3648
3649 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3650 DAG.getConstant(8, DL, MVT::i32));
3651 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3652 DAG.getConstant(0xffe, DL, MVT::i32));
3653
3654 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3655 DAG.getConstant(0x1ff, DL, MVT::i32));
3656 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3657
3658 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3659 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3660
3661 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3662 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3663 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3664 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3665
3666 // N = M | (E << 12);
3667 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3668 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3669 DAG.getConstant(12, DL, MVT::i32)));
3670
3671 // B = clamp(1-E, 0, 13);
3672 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3673 One, E);
3674 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3675 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3676 DAG.getConstant(13, DL, MVT::i32));
3677
3678 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3679 DAG.getConstant(0x1000, DL, MVT::i32));
3680
3681 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3682 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3683 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3684 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3685
3686 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3687 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3688 DAG.getConstant(0x7, DL, MVT::i32));
3689 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3690 DAG.getConstant(2, DL, MVT::i32));
3691 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3692 One, Zero, ISD::SETEQ);
3693 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3694 One, Zero, ISD::SETGT);
3695 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3696 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3697
3698 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3699 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3700 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3701 I, V, ISD::SETEQ);
3702
3703 // Extract the sign bit.
3704 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3705 DAG.getConstant(16, DL, MVT::i32));
3706 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3707 DAG.getConstant(0x8000, DL, MVT::i32));
3708
3709 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3710}
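
// Illustrative note only (not part of this file): after the re-bias above the
// exponent is E = raw_f64_exponent - 1023 + 15, so the f64 Inf/NaN exponent
// 0x7ff maps to E == 1039 (the special-case compare near the end) and any
// finite result that would need E > 30 is clamped to the f16 Inf pattern
// 0x7c00.
#include <cstdint>

static int rebiasedExpSketch(uint64_t F64Bits) {
  int RawExp = int((F64Bits >> 52) & 0x7ff); // 11-bit f64 exponent field
  return RawExp - 1023 + 15;                 // re-bias for the f16 format
}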
3711
3712SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3713 SelectionDAG &DAG) const {
3714 SDValue Src = Op.getOperand(0);
3715 unsigned OpOpcode = Op.getOpcode();
3716 EVT SrcVT = Src.getValueType();
3717 EVT DestVT = Op.getValueType();
3718
3719 // Will be selected natively
3720 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3721 return Op;
3722
3723 if (SrcVT == MVT::bf16) {
3724 SDLoc DL(Op);
3725 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3726 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3727 }
3728
3729 // Promote i16 to i32
3730 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3731 SDLoc DL(Op);
3732
3733 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3734 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3735 }
3736
3737 if (DestVT != MVT::i64)
3738 return Op;
3739
3740 if (SrcVT == MVT::f16 ||
3741 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3742 SDLoc DL(Op);
3743
3744 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3745 unsigned Ext =
3746 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3747 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3748 }
3749
3750 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3751 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3752
3753 return SDValue();
3754}
3755
3756SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3757 SelectionDAG &DAG) const {
3758 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3759 MVT VT = Op.getSimpleValueType();
3760 MVT ScalarVT = VT.getScalarType();
3761
3762 assert(VT.isVector());
3763
3764 SDValue Src = Op.getOperand(0);
3765 SDLoc DL(Op);
3766
3767 // TODO: Don't scalarize on Evergreen?
3768 unsigned NElts = VT.getVectorNumElements();
3769 SmallVector<SDValue, 8> Args;
3770 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3771
3772 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3773 for (unsigned I = 0; I < NElts; ++I)
3774 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3775
3776 return DAG.getBuildVector(VT, DL, Args);
3777}
3778
3779//===----------------------------------------------------------------------===//
3780// Custom DAG optimizations
3781//===----------------------------------------------------------------------===//
3782
3783static bool isU24(SDValue Op, SelectionDAG &DAG) {
3784 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3785}
3786
3787static bool isI24(SDValue Op, SelectionDAG &DAG) {
3788 EVT VT = Op.getValueType();
3789 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3790 // as unsigned 24-bit values.
3791 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3792}
3793
3794SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3795 DAGCombinerInfo &DCI) const {
3796 SelectionDAG &DAG = DCI.DAG;
3797 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3798 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3799
3800 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3801 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3802 unsigned NewOpcode = Node24->getOpcode();
3803 if (IsIntrin) {
3804 unsigned IID = Node24->getConstantOperandVal(0);
3805 switch (IID) {
3806 case Intrinsic::amdgcn_mul_i24:
3807 NewOpcode = AMDGPUISD::MUL_I24;
3808 break;
3809 case Intrinsic::amdgcn_mul_u24:
3810 NewOpcode = AMDGPUISD::MUL_U24;
3811 break;
3812 case Intrinsic::amdgcn_mulhi_i24:
3813 NewOpcode = AMDGPUISD::MULHI_I24;
3814 break;
3815 case Intrinsic::amdgcn_mulhi_u24:
3816 NewOpcode = AMDGPUISD::MULHI_U24;
3817 break;
3818 default:
3819 llvm_unreachable("Expected 24-bit mul intrinsic");
3820 }
3821 }
3822
3823 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3824
3825 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3826 // the operands to have other uses, but will only perform simplifications that
3827 // involve bypassing some nodes for this user.
3828 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3829 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3830 if (DemandedLHS || DemandedRHS)
3831 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3832 DemandedLHS ? DemandedLHS : LHS,
3833 DemandedRHS ? DemandedRHS : RHS);
3834
3835 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3836 // operands if this node is the only user.
3837 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3838 return SDValue(Node24, 0);
3839 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3840 return SDValue(Node24, 0);
3841
3842 return SDValue();
3843}
3844
3845template <typename IntTy>
3846static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3847 uint32_t Width, const SDLoc &DL) {
3848 if (Width + Offset < 32) {
3849 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3850 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3851 if constexpr (std::is_signed_v<IntTy>) {
3852 return DAG.getSignedConstant(Result, DL, MVT::i32);
3853 } else {
3854 return DAG.getConstant(Result, DL, MVT::i32);
3855 }
3856 }
3857
3858 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3859}
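
// Illustrative sketch only (not part of this file): the signed bitfield
// extract performed by the constant fold above, instantiated for
// IntTy = int32_t and shown on plain integers (a nonzero Width is assumed,
// as at the call sites).
#include <cstdint>

static int32_t bfeI32Sketch(int32_t Src, uint32_t Offset, uint32_t Width) {
  if (Width + Offset < 32) {
    uint32_t Shl = uint32_t(Src) << (32 - Offset - Width); // drop high bits
    return int32_t(Shl) >> (32 - Width); // arithmetic shift sign-extends
  }
  return Src >> Offset; // field reaches bit 31; one arithmetic shift suffices
}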
3860
3861static bool hasVolatileUser(SDNode *Val) {
3862 for (SDNode *U : Val->users()) {
3863 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3864 if (M->isVolatile())
3865 return true;
3866 }
3867 }
3868
3869 return false;
3870}
3871
3872bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3873 // i32 vectors are the canonical memory type.
3874 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3875 return false;
3876
3877 if (!VT.isByteSized())
3878 return false;
3879
3880 unsigned Size = VT.getStoreSize();
3881
3882 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3883 return false;
3884
3885 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3886 return false;
3887
3888 return true;
3889}
3890
3891// Replace load of an illegal type with a bitcast from a load of a friendlier
3892// type.
3893SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3894 DAGCombinerInfo &DCI) const {
3895 if (!DCI.isBeforeLegalize())
3896 return SDValue();
3897
3898 LoadSDNode *LN = cast<LoadSDNode>(N);
3899 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3900 return SDValue();
3901
3902 SDLoc SL(N);
3903 SelectionDAG &DAG = DCI.DAG;
3904 EVT VT = LN->getMemoryVT();
3905
3906 unsigned Size = VT.getStoreSize();
3907 Align Alignment = LN->getAlign();
3908 if (Alignment < Size && isTypeLegal(VT)) {
3909 unsigned IsFast;
3910 unsigned AS = LN->getAddressSpace();
3911
3912 // Expand unaligned loads earlier than legalization. Due to visitation order
3913 // problems during legalization, the emitted instructions to pack and unpack
3914 // the bytes again are not eliminated in the case of an unaligned copy.
3915 if (!allowsMisalignedMemoryAccesses(
3916 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3917 if (VT.isVector())
3918 return SplitVectorLoad(SDValue(LN, 0), DAG);
3919
3920 SDValue Ops[2];
3921 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3922
3923 return DAG.getMergeValues(Ops, SDLoc(N));
3924 }
3925
3926 if (!IsFast)
3927 return SDValue();
3928 }
3929
3930 if (!shouldCombineMemoryType(VT))
3931 return SDValue();
3932
3933 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3934
3935 SDValue NewLoad
3936 = DAG.getLoad(NewVT, SL, LN->getChain(),
3937 LN->getBasePtr(), LN->getMemOperand());
3938
3939 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3940 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3941 return SDValue(N, 0);
3942}
3943
3944// Replace store of an illegal type with a store of a bitcast to a friendlier
3945// type.
3946SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3947 DAGCombinerInfo &DCI) const {
3948 if (!DCI.isBeforeLegalize())
3949 return SDValue();
3950
3951 StoreSDNode *SN = cast<StoreSDNode>(N);
3952 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3953 return SDValue();
3954
3955 EVT VT = SN->getMemoryVT();
3956 unsigned Size = VT.getStoreSize();
3957
3958 SDLoc SL(N);
3959 SelectionDAG &DAG = DCI.DAG;
3960 Align Alignment = SN->getAlign();
3961 if (Alignment < Size && isTypeLegal(VT)) {
3962 unsigned IsFast;
3963 unsigned AS = SN->getAddressSpace();
3964
3965 // Expand unaligned stores earlier than legalization. Due to visitation
3966 // order problems during legalization, the emitted instructions to pack and
3967 // unpack the bytes again are not eliminated in the case of an unaligned
3968 // copy.
3969 if (!allowsMisalignedMemoryAccesses(
3970 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3971 if (VT.isVector())
3972 return SplitVectorStore(SDValue(SN, 0), DAG);
3973
3974 return expandUnalignedStore(SN, DAG);
3975 }
3976
3977 if (!IsFast)
3978 return SDValue();
3979 }
3980
3981 if (!shouldCombineMemoryType(VT))
3982 return SDValue();
3983
3984 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3985 SDValue Val = SN->getValue();
3986
3987 //DCI.AddToWorklist(Val.getNode());
3988
3989 bool OtherUses = !Val.hasOneUse();
3990 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3991 if (OtherUses) {
3992 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3993 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3994 }
3995
3996 return DAG.getStore(SN->getChain(), SL, CastVal,
3997 SN->getBasePtr(), SN->getMemOperand());
3998}
3999
4000// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4001// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4002// issues.
4003SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4004 DAGCombinerInfo &DCI) const {
4005 SelectionDAG &DAG = DCI.DAG;
4006 SDValue N0 = N->getOperand(0);
4007
4008 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4009 // (vt2 (truncate (assertzext vt0:x, vt1)))
4010 if (N0.getOpcode() == ISD::TRUNCATE) {
4011 SDValue N1 = N->getOperand(1);
4012 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4013 SDLoc SL(N);
4014
4015 SDValue Src = N0.getOperand(0);
4016 EVT SrcVT = Src.getValueType();
4017 if (SrcVT.bitsGE(ExtVT)) {
4018 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4019 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4020 }
4021 }
4022
4023 return SDValue();
4024}
4025
4026SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4027 SDNode *N, DAGCombinerInfo &DCI) const {
4028 unsigned IID = N->getConstantOperandVal(0);
4029 switch (IID) {
4030 case Intrinsic::amdgcn_mul_i24:
4031 case Intrinsic::amdgcn_mul_u24:
4032 case Intrinsic::amdgcn_mulhi_i24:
4033 case Intrinsic::amdgcn_mulhi_u24:
4034 return simplifyMul24(N, DCI);
4035 case Intrinsic::amdgcn_fract:
4036 case Intrinsic::amdgcn_rsq:
4037 case Intrinsic::amdgcn_rcp_legacy:
4038 case Intrinsic::amdgcn_rsq_legacy:
4039 case Intrinsic::amdgcn_rsq_clamp:
4040 case Intrinsic::amdgcn_tanh:
4041 case Intrinsic::amdgcn_prng_b32: {
4042 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4043 SDValue Src = N->getOperand(1);
4044 return Src.isUndef() ? Src : SDValue();
4045 }
4046 case Intrinsic::amdgcn_frexp_exp: {
4047 // frexp_exp (fneg x) -> frexp_exp x
4048 // frexp_exp (fabs x) -> frexp_exp x
4049 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4050 SDValue Src = N->getOperand(1);
4051 SDValue PeekSign = peekFPSignOps(Src);
4052 if (PeekSign == Src)
4053 return SDValue();
4054 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4055 0);
4056 }
4057 default:
4058 return SDValue();
4059 }
4060}
4061
4062/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4063/// binary operation \p Opc to it with the corresponding constant operands.
4064SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4065 DAGCombinerInfo &DCI, const SDLoc &SL,
4066 unsigned Opc, SDValue LHS,
4067 uint32_t ValLo, uint32_t ValHi) const {
4068 SelectionDAG &DAG = DCI.DAG;
4069 SDValue Lo, Hi;
4070 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4071
4072 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4073 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4074
4075 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4076 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4077
4078 // Re-visit the ands. It's possible we eliminated one of them and it could
4079 // simplify the vector.
4080 DCI.AddToWorklist(Lo.getNode());
4081 DCI.AddToWorklist(Hi.getNode());
4082
4083 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4084 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4085}
4086
4087SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4088 DAGCombinerInfo &DCI) const {
4089 EVT VT = N->getValueType(0);
4090 SDValue LHS = N->getOperand(0);
4091 SDValue RHS = N->getOperand(1);
4092 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4093 SDLoc SL(N);
4094 SelectionDAG &DAG = DCI.DAG;
4095
4096 unsigned RHSVal;
4097 if (CRHS) {
4098 RHSVal = CRHS->getZExtValue();
4099 if (!RHSVal)
4100 return LHS;
4101
4102 switch (LHS->getOpcode()) {
4103 default:
4104 break;
4105 case ISD::ZERO_EXTEND:
4106 case ISD::SIGN_EXTEND:
4107 case ISD::ANY_EXTEND: {
4108 SDValue X = LHS->getOperand(0);
4109
4110 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4111 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4112 // Prefer build_vector as the canonical form if packed types are legal.
4113 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4114 SDValue Vec = DAG.getBuildVector(
4115 MVT::v2i16, SL,
4116 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4117 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4118 }
4119
4120 // shl (ext x) => zext (shl x), if shift does not overflow int
4121 if (VT != MVT::i64)
4122 break;
4123 KnownBits Known = DAG.computeKnownBits(X);
4124 unsigned LZ = Known.countMinLeadingZeros();
4125 if (LZ < RHSVal)
4126 break;
4127 EVT XVT = X.getValueType();
4128 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4129 return DAG.getZExtOrTrunc(Shl, SL, VT);
4130 }
4131 }
4132 }
4133
4134 if (VT.getScalarType() != MVT::i64)
4135 return SDValue();
4136
4137 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4138
4139 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4140 // common case, splitting this into a move and a 32-bit shift is faster and
4141 // the same code size.
4142 KnownBits Known = DAG.computeKnownBits(RHS);
4143
4144 EVT ElementType = VT.getScalarType();
4145 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4146 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4147 : TargetScalarType;
4148
4149 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4150 return SDValue();
4151 SDValue ShiftAmt;
4152
4153 if (CRHS) {
4154 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4155 TargetType);
4156 } else {
4157 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4158 const SDValue ShiftMask =
4159 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4160 // This AND instruction will clamp out of bounds shift values.
4161 // It will also be removed during later instruction selection.
4162 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4163 }
4164
4165 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4166 SDValue NewShift =
4167 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4168
4169 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4170 SDValue Vec;
4171
4172 if (VT.isVector()) {
4173 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4174 unsigned NElts = TargetType.getVectorNumElements();
4175 SmallVector<SDValue, 8> HiOps;
4176 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4177
4178 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4179 for (unsigned I = 0; I != NElts; ++I)
4180 HiAndLoOps[2 * I + 1] = HiOps[I];
4181 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4182 } else {
4183 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4184 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4185 }
4186 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4187}
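
// Illustrative check only (not part of this file): for a shift amount
// C >= 32 the whole 64-bit result lives in the high half, which is what the
// build_vector {0, NewShift} above expresses.
#include <cassert>
#include <cstdint>

static void shl64SplitSketch(uint64_t X, unsigned C) {
  assert(C >= 32 && C < 64);
  uint32_t NewShift = uint32_t(X) << (C - 32); // 32-bit shift of the low half
  uint64_t Split = uint64_t(NewShift) << 32;   // {lo = 0, hi = NewShift}
  assert(Split == (X << C));
}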
4188
4189SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4190 DAGCombinerInfo &DCI) const {
4191 SDValue RHS = N->getOperand(1);
4192 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4193 EVT VT = N->getValueType(0);
4194 SDValue LHS = N->getOperand(0);
4195 SelectionDAG &DAG = DCI.DAG;
4196 SDLoc SL(N);
4197
4198 if (VT.getScalarType() != MVT::i64)
4199 return SDValue();
4200
4201 // For C >= 32
4202 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4203
4204 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4205 // common case, splitting this into a move and a 32-bit shift is faster and
4206 // the same code size.
4207 KnownBits Known = DAG.computeKnownBits(RHS);
4208
4209 EVT ElementType = VT.getScalarType();
4210 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4211 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4212 : TargetScalarType;
4213
4214 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4215 return SDValue();
4216
4217 SDValue ShiftFullAmt =
4218 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4219 SDValue ShiftAmt;
4220 if (CRHS) {
4221 unsigned RHSVal = CRHS->getZExtValue();
4222 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4223 TargetType);
4224 } else if (Known.getMinValue().getZExtValue() ==
4225 (ElementType.getSizeInBits() - 1)) {
4226 ShiftAmt = ShiftFullAmt;
4227 } else {
4228 SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4229 const SDValue ShiftMask =
4230 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4231 // This AND instruction will clamp out of bounds shift values.
4232 // It will also be removed during later instruction selection.
4233 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
4234 }
4235
4236 EVT ConcatType;
4237 SDValue Hi;
4238 SDLoc LHSSL(LHS);
4239 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4240 if (VT.isVector()) {
4241 unsigned NElts = TargetType.getVectorNumElements();
4242 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4243 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4244 SmallVector<SDValue, 8> HiOps(NElts);
4245 SmallVector<SDValue, 16> HiAndLoOps;
4246
4247 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4248 for (unsigned I = 0; I != NElts; ++I) {
4249 HiOps[I] = HiAndLoOps[2 * I + 1];
4250 }
4251 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4252 } else {
4253 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4254 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4255 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4256 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4257 }
4258
4259 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4260 SDValue HiShift;
4261 if (KnownLHS.isNegative()) {
4262 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4263 } else {
4264 Hi = DAG.getFreeze(Hi);
4265 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4266 }
4267 SDValue NewShift =
4268 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4269
4270 SDValue Vec;
4271 if (VT.isVector()) {
4272 unsigned NElts = TargetType.getVectorNumElements();
4273 SmallVector<SDValue, 8> HiOps;
4274 SmallVector<SDValue, 8> LoOps;
4275 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4276
4277 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4278 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4279 for (unsigned I = 0; I != NElts; ++I) {
4280 HiAndLoOps[2 * I + 1] = HiOps[I];
4281 HiAndLoOps[2 * I] = LoOps[I];
4282 }
4283 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4284 } else {
4285 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4286 }
4287 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4288}
4289
4290SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4291 DAGCombinerInfo &DCI) const {
4292 SDValue RHS = N->getOperand(1);
4293 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4294 EVT VT = N->getValueType(0);
4295 SDValue LHS = N->getOperand(0);
4296 SelectionDAG &DAG = DCI.DAG;
4297 SDLoc SL(N);
4298 unsigned RHSVal;
4299
4300 if (CRHS) {
4301 RHSVal = CRHS->getZExtValue();
4302
4303 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4304 // this improves the ability to match BFE patterns in isel.
4305 if (LHS.getOpcode() == ISD::AND) {
4306 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4307 unsigned MaskIdx, MaskLen;
4308 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4309 MaskIdx == RHSVal) {
4310 return DAG.getNode(ISD::AND, SL, VT,
4311 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4312 N->getOperand(1)),
4313 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4314 N->getOperand(1)));
4315 }
4316 }
4317 }
4318 }
4319
4320 if (VT.getScalarType() != MVT::i64)
4321 return SDValue();
4322
4323 // for C >= 32
4324 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4325
4326 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4327 // common case, splitting this into a move and a 32-bit shift is faster and
4328 // the same code size.
4329 KnownBits Known = DAG.computeKnownBits(RHS);
4330
4331 EVT ElementType = VT.getScalarType();
4332 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4333 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4334 : TargetScalarType;
4335
4336 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4337 return SDValue();
4338
4339 SDValue ShiftAmt;
4340 if (CRHS) {
4341 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4342 TargetType);
4343 } else {
4344 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4345 const SDValue ShiftMask =
4346 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4347 // This AND instruction will clamp out of bounds shift values.
4348 // It will also be removed during later instruction selection.
4349 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4350 }
4351
4352 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4353 EVT ConcatType;
4354 SDValue Hi;
4355 SDLoc LHSSL(LHS);
4356 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4357 if (VT.isVector()) {
4358 unsigned NElts = TargetType.getVectorNumElements();
4359 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4360 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4361 SmallVector<SDValue, 8> HiOps(NElts);
4362 SmallVector<SDValue, 16> HiAndLoOps;
4363
4364 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4365 for (unsigned I = 0; I != NElts; ++I)
4366 HiOps[I] = HiAndLoOps[2 * I + 1];
4367 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4368 } else {
4369 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4370 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4371 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4372 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4373 }
4374
4375 SDValue NewShift =
4376 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4377
4378 SDValue Vec;
4379 if (VT.isVector()) {
4380 unsigned NElts = TargetType.getVectorNumElements();
4381 SmallVector<SDValue, 8> LoOps;
4382 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4383
4384 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4385 for (unsigned I = 0; I != NElts; ++I)
4386 HiAndLoOps[2 * I] = LoOps[I];
4387 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4388 } else {
4389 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4390 }
4391 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4392}
4393
4394SDValue AMDGPUTargetLowering::performTruncateCombine(
4395 SDNode *N, DAGCombinerInfo &DCI) const {
4396 SDLoc SL(N);
4397 SelectionDAG &DAG = DCI.DAG;
4398 EVT VT = N->getValueType(0);
4399 SDValue Src = N->getOperand(0);
4400
4401 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4402 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4403 SDValue Vec = Src.getOperand(0);
4404 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4405 SDValue Elt0 = Vec.getOperand(0);
4406 EVT EltVT = Elt0.getValueType();
4407 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4408 if (EltVT.isFloatingPoint()) {
4409 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4410 EltVT.changeTypeToInteger(), Elt0);
4411 }
4412
4413 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4414 }
4415 }
4416 }
4417
4418 // Equivalent of above for accessing the high element of a vector as an
4419 // integer operation.
4420 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4421 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4422 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4423 SDValue BV = stripBitcast(Src.getOperand(0));
4424 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4425 EVT SrcEltVT = BV.getOperand(0).getValueType();
4426 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4427 unsigned BitIndex = K->getZExtValue();
4428 unsigned PartIndex = BitIndex / SrcEltSize;
4429
4430 if (PartIndex * SrcEltSize == BitIndex &&
4431 PartIndex < BV.getNumOperands()) {
4432 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4433 SDValue SrcElt =
4434 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4435 BV.getOperand(PartIndex));
4436 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4437 }
4438 }
4439 }
4440 }
4441 }
4442
4443 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4444 //
4445 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4446 // i16 (trunc (srl (i32 (trunc x), K)))
4447 if (VT.getScalarSizeInBits() < 32) {
4448 EVT SrcVT = Src.getValueType();
4449 if (SrcVT.getScalarSizeInBits() > 32 &&
4450 (Src.getOpcode() == ISD::SRL ||
4451 Src.getOpcode() == ISD::SRA ||
4452 Src.getOpcode() == ISD::SHL)) {
4453 SDValue Amt = Src.getOperand(1);
4454 KnownBits Known = DAG.computeKnownBits(Amt);
4455
4456 // - For left shifts, do the transform as long as the shift
4457 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4458 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4459 // losing information stored in the high bits when truncating.
4460 const unsigned MaxCstSize =
4461 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4462 if (Known.getMaxValue().ule(MaxCstSize)) {
4463 EVT MidVT = VT.isVector() ?
4464 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4465 VT.getVectorNumElements()) : MVT::i32;
4466
4467 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4468 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4469 Src.getOperand(0));
4470 DCI.AddToWorklist(Trunc.getNode());
4471
4472 if (Amt.getValueType() != NewShiftVT) {
4473 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4474 DCI.AddToWorklist(Amt.getNode());
4475 }
4476
4477 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4478 Trunc, Amt);
4479 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4480 }
4481 }
4482 }
4483
4484 return SDValue();
4485}
4486
4487// We need to specifically handle i64 mul here to avoid unnecessary conversion
4488// instructions. If we only match on the legalized i64 mul expansion,
4489// SimplifyDemandedBits will be unable to remove them because there will be
4490// multiple uses due to the separate mul + mulh[su].
4491static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4492 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4493 if (Size <= 32) {
4494 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4495 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4496 }
4497
4498 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4499 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4500
4501 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4502 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4503
4504 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4505}
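
// Illustrative sketch only (not part of this file): why the MUL_U24 + MULHI_U24
// pair above reproduces the full product. Operands known to fit in 24 bits give
// a product below 2^48, so the low and high 32-bit words of a plain widening
// multiply are exactly what the two target nodes return.
#include <cstdint>

static uint64_t mulU24PairSketch(uint32_t N0, uint32_t N1) {
  uint64_t Full = uint64_t(N0) * uint64_t(N1); // both < 2^24 => Full < 2^48
  uint32_t MulLo = uint32_t(Full);             // MUL_U24 result
  uint32_t MulHi = uint32_t(Full >> 32);       // MULHI_U24 result
  return (uint64_t(MulHi) << 32) | MulLo;      // BUILD_PAIR MulLo, MulHi
}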
4506
4507/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4508/// return SDValue().
4509static SDValue getAddOneOp(const SDNode *V) {
4510 if (V->getOpcode() != ISD::ADD)
4511 return SDValue();
4512
4513 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4514}
4515
4516SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4517 DAGCombinerInfo &DCI) const {
4518 assert(N->getOpcode() == ISD::MUL);
4519 EVT VT = N->getValueType(0);
4520
4521 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4522 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4523 // unnecessarily). isDivergent() is used as an approximation of whether the
4524 // value is in an SGPR.
4525 if (!N->isDivergent())
4526 return SDValue();
4527
4528 unsigned Size = VT.getSizeInBits();
4529 if (VT.isVector() || Size > 64)
4530 return SDValue();
4531
4532 SelectionDAG &DAG = DCI.DAG;
4533 SDLoc DL(N);
4534
4535 SDValue N0 = N->getOperand(0);
4536 SDValue N1 = N->getOperand(1);
4537
4538 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4539 // matching.
4540
4541 // mul x, (add y, 1) -> add (mul x, y), x
4542 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4543 SDValue AddOp = getAddOneOp(V.getNode());
4544 if (!AddOp)
4545 return SDValue();
4546
4547 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4548 return U->getOpcode() == ISD::MUL;
4549 }))
4550 return AddOp;
4551
4552 return SDValue();
4553 };
4554
4555 // FIXME: The selection pattern is not properly checking for commuted
4556 // operands, so we have to place the mul in the LHS
4557 if (SDValue MulOper = IsFoldableAdd(N0)) {
4558 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4559 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4560 }
4561
4562 if (SDValue MulOper = IsFoldableAdd(N1)) {
4563 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4564 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4565 }
4566
4567 // There are i16 integer mul/mad.
4568 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4569 return SDValue();
4570
4571 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4572 // in the source into any_extends if the result of the mul is truncated. Since
4573 // we can assume the high bits are whatever we want, use the underlying value
4574 // to keep the unknown high bits from interfering.
4575 if (N0.getOpcode() == ISD::ANY_EXTEND)
4576 N0 = N0.getOperand(0);
4577
4578 if (N1.getOpcode() == ISD::ANY_EXTEND)
4579 N1 = N1.getOperand(0);
4580
4581 SDValue Mul;
4582
4583 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4584 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4585 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4586 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4587 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4588 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4589 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4590 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4591 } else {
4592 return SDValue();
4593 }
4594
4595 // We need to use sext even for MUL_U24, because MUL_U24 is used
4596 // for signed multiply of 8 and 16-bit types.
4597 return DAG.getSExtOrTrunc(Mul, DL, VT);
4598}
4599
4600SDValue
4601AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4602 DAGCombinerInfo &DCI) const {
4603 if (N->getValueType(0) != MVT::i32)
4604 return SDValue();
4605
4606 SelectionDAG &DAG = DCI.DAG;
4607 SDLoc DL(N);
4608
4609 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4610 SDValue N0 = N->getOperand(0);
4611 SDValue N1 = N->getOperand(1);
4612
4613 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4614 // in the source into any_extends if the result of the mul is truncated. Since
4615 // we can assume the high bits are whatever we want, use the underlying value
4616 // to keep the unknown high bits from interfering.
4617 if (N0.getOpcode() == ISD::ANY_EXTEND)
4618 N0 = N0.getOperand(0);
4619 if (N1.getOpcode() == ISD::ANY_EXTEND)
4620 N1 = N1.getOperand(0);
4621
4622 // Try to use two fast 24-bit multiplies (one for each half of the result)
4623 // instead of one slow extending multiply.
4624 unsigned LoOpcode = 0;
4625 unsigned HiOpcode = 0;
4626 if (Signed) {
4627 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4628 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4629 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4630 LoOpcode = AMDGPUISD::MUL_I24;
4631 HiOpcode = AMDGPUISD::MULHI_I24;
4632 }
4633 } else {
4634 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4635 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4636 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4637 LoOpcode = AMDGPUISD::MUL_U24;
4638 HiOpcode = AMDGPUISD::MULHI_U24;
4639 }
4640 }
4641 if (!LoOpcode)
4642 return SDValue();
4643
4644 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4645 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4646 DCI.CombineTo(N, Lo, Hi);
4647 return SDValue(N, 0);
4648}
4649
4650SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4651 DAGCombinerInfo &DCI) const {
4652 EVT VT = N->getValueType(0);
4653
4654 if (!Subtarget->hasMulI24() || VT.isVector())
4655 return SDValue();
4656
4657 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4658 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4659 // unnecessarily). isDivergent() is used as an approximation of whether the
4660 // value is in an SGPR.
4661 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4662 // valu op anyway)
4663 if (Subtarget->hasSMulHi() && !N->isDivergent())
4664 return SDValue();
4665
4666 SelectionDAG &DAG = DCI.DAG;
4667 SDLoc DL(N);
4668
4669 SDValue N0 = N->getOperand(0);
4670 SDValue N1 = N->getOperand(1);
4671
4672 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4673 return SDValue();
4674
4675 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4676 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4677
4678 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4679 DCI.AddToWorklist(Mulhi.getNode());
4680 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4681}
4682
4683SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4684 DAGCombinerInfo &DCI) const {
4685 EVT VT = N->getValueType(0);
4686
4687 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4688 return SDValue();
4689
4690 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4691 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4692 // unnecessarily). isDivergent() is used as an approximation of whether the
4693 // value is in an SGPR.
4694 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4695 // valu op anyway)
4696 if (Subtarget->hasSMulHi() && !N->isDivergent())
4697 return SDValue();
4698
4699 SelectionDAG &DAG = DCI.DAG;
4700 SDLoc DL(N);
4701
4702 SDValue N0 = N->getOperand(0);
4703 SDValue N1 = N->getOperand(1);
4704
4705 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4706 return SDValue();
4707
4708 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4709 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4710
4711 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4712 DCI.AddToWorklist(Mulhi.getNode());
4713 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4714}
4715
4716SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4717 SDValue Op,
4718 const SDLoc &DL,
4719 unsigned Opc) const {
4720 EVT VT = Op.getValueType();
4721 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4722 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4723 LegalVT != MVT::i16))
4724 return SDValue();
4725
4726 if (VT != MVT::i32)
4727 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4728
4729 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4730 if (VT != MVT::i32)
4731 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4732
4733 return FFBX;
4734}
4735
4736// The native instructions return -1 on 0 input. Optimize out a select that
4737// produces -1 on 0.
4738//
4739// TODO: If zero is not undef, we could also do this if the output is compared
4740// against the bitwidth.
4741//
4742// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4743SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4744 SDValue LHS, SDValue RHS,
4745 DAGCombinerInfo &DCI) const {
4746 if (!isNullConstant(Cond.getOperand(1)))
4747 return SDValue();
4748
4749 SelectionDAG &DAG = DCI.DAG;
4750 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4751 SDValue CmpLHS = Cond.getOperand(0);
4752
4753 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4754 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4755 if (CCOpcode == ISD::SETEQ &&
4756 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4757 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4758 unsigned Opc =
4759 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4760 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4761 }
4762
4763 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4764 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4765 if (CCOpcode == ISD::SETNE &&
4766 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4767 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4768 unsigned Opc =
4769 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4770
4771 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4772 }
4773
4774 return SDValue();
4775}
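
// Illustrative model only (not part of this file): the select is redundant
// because the FFBH/FFBL style instructions already return -1 (all ones) on a
// zero input, exactly the value the select would inject.
#include <bit>
#include <cstdint>

static uint32_t ffbhU32Sketch(uint32_t X) {
  return X == 0 ? 0xffffffffu : uint32_t(std::countl_zero(X));
}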
4776
4777static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4778 unsigned Op,
4779 const SDLoc &SL,
4780 SDValue Cond,
4781 SDValue N1,
4782 SDValue N2) {
4783 SelectionDAG &DAG = DCI.DAG;
4784 EVT VT = N1.getValueType();
4785
4786 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4787 N1.getOperand(0), N2.getOperand(0));
4788 DCI.AddToWorklist(NewSelect.getNode());
4789 return DAG.getNode(Op, SL, VT, NewSelect);
4790}
4791
4792// Pull a free FP operation out of a select so it may fold into uses.
4793//
4794// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4795// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4796//
4797// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4798// select c, (fabs x), +k -> fabs (select c, x, k)
4799SDValue
4800AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4801 SDValue N) const {
4802 SelectionDAG &DAG = DCI.DAG;
4803 SDValue Cond = N.getOperand(0);
4804 SDValue LHS = N.getOperand(1);
4805 SDValue RHS = N.getOperand(2);
4806
4807 EVT VT = N.getValueType();
4808 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4809 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4810 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4811 return SDValue();
4812
4813 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4814 SDLoc(N), Cond, LHS, RHS);
4815 }
4816
4817 bool Inv = false;
4818 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4819 std::swap(LHS, RHS);
4820 Inv = true;
4821 }
4822
4823 // TODO: Support vector constants.
4824 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4825 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4826 !selectSupportsSourceMods(N.getNode())) {
4827 SDLoc SL(N);
4828 // If one side is an fneg/fabs and the other is a constant, we can push the
4829 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4830 SDValue NewLHS = LHS.getOperand(0);
4831 SDValue NewRHS = RHS;
4832
4833 // Careful: if the neg can be folded up, don't try to pull it back down.
4834 bool ShouldFoldNeg = true;
4835
4836 if (NewLHS.hasOneUse()) {
4837 unsigned Opc = NewLHS.getOpcode();
4838 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4839 ShouldFoldNeg = false;
4840 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4841 ShouldFoldNeg = false;
4842 }
4843
4844 if (ShouldFoldNeg) {
4845 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4846 return SDValue();
4847
4848 // We're going to be forced to use a source modifier anyway, there's no
4849 // point to pulling the negate out unless we can get a size reduction by
4850 // negating the constant.
4851 //
4852 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4853 // about cheaper constants.
4854 if (NewLHS.getOpcode() == ISD::FABS &&
4855 !isConstantCostlierToNegate(DAG, RHS))
4856 return SDValue();
4857
4858 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4859 return SDValue();
4860
4861 if (LHS.getOpcode() == ISD::FNEG)
4862 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4863
4864 if (Inv)
4865 std::swap(NewLHS, NewRHS);
4866
4867 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4868 Cond, NewLHS, NewRHS);
4869 DCI.AddToWorklist(NewSelect.getNode());
4870 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4871 }
4872 }
4873
4874 return SDValue();
4875}
4876
4877SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4878 DAGCombinerInfo &DCI) const {
4879 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4880 return Folded;
4881
4882 SDValue Cond = N->getOperand(0);
4883 if (Cond.getOpcode() != ISD::SETCC)
4884 return SDValue();
4885
4886 EVT VT = N->getValueType(0);
4887 SDValue LHS = Cond.getOperand(0);
4888 SDValue RHS = Cond.getOperand(1);
4889 SDValue CC = Cond.getOperand(2);
4890
4891 SDValue True = N->getOperand(1);
4892 SDValue False = N->getOperand(2);
4893
4894 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4895 SelectionDAG &DAG = DCI.DAG;
4896 if (DAG.isConstantValueOfAnyType(True) &&
4897 !DAG.isConstantValueOfAnyType(False)) {
4898 // Swap cmp + select pair to move constant to false input.
4899 // This will allow using VOPC cndmasks more often.
4900 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4901
4902 SDLoc SL(N);
4903 ISD::CondCode NewCC =
4904 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4905
4906 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4907 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4908 }
4909
4910 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4911 SDValue MinMax
4912 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4913 // Revisit this node so we can catch min3/max3/med3 patterns.
4914 //DCI.AddToWorklist(MinMax.getNode());
4915 return MinMax;
4916 }
4917 }
4918
4919 // There's no reason to not do this if the condition has other uses.
4920 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4921}
4922
4923static bool isInv2Pi(const APFloat &APF) {
4924 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4925 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4926 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4927
4928 return APF.bitwiseIsEqual(KF16) ||
4929 APF.bitwiseIsEqual(KF32) ||
4930 APF.bitwiseIsEqual(KF64);
4931}
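
// Illustrative check only (not part of this file): 0x3e22f983 above is the
// single-precision bit pattern of 1/(2*pi), the value that qualifies as an
// inline immediate when hasInv2PiInlineImm() is set.
#include <bit>
#include <cassert>

static void inv2PiBitsSketch() {
  float Inv2Pi = std::bit_cast<float>(0x3e22f983u);
  assert(Inv2Pi > 0.159154f && Inv2Pi < 0.159156f); // ~0.15915494 == 1/(2*pi)
}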
4932
4933// 0 and 1.0 / (2.0 * pi) do not have inline immediates, so there is an
4934// additional cost to negate them.
4935TargetLowering::NegatibleCost
4936AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4937 if (C->isZero())
4938 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4939
4940 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4941 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4942
4943 return NegatibleCost::Neutral;
4944}
4945
4946bool AMDGPUTargetLowering::isConstantCostlierToNegate(SelectionDAG &DAG,
4947 SDValue N) const {
4948 if (ConstantFPSDNode *C = isConstOrConstSplatFP(N)) return getConstantNegateCost(C) == NegatibleCost::Expensive;
4949 return false;
4950}
4951
4952bool AMDGPUTargetLowering::isConstantCheaperToNegate(SelectionDAG &DAG,
4953 SDValue N) const {
4954 if (ConstantFPSDNode *C = isConstOrConstSplatFP(N)) return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4955 return false;
4956}
4957
4958static unsigned inverseMinMax(unsigned Opc) {
4959 switch (Opc) {
4960 case ISD::FMAXNUM:
4961 return ISD::FMINNUM;
4962 case ISD::FMINNUM:
4963 return ISD::FMAXNUM;
4964 case ISD::FMAXNUM_IEEE:
4965 return ISD::FMINNUM_IEEE;
4966 case ISD::FMINNUM_IEEE:
4967 return ISD::FMAXNUM_IEEE;
4968 case ISD::FMAXIMUM:
4969 return ISD::FMINIMUM;
4970 case ISD::FMINIMUM:
4971 return ISD::FMAXIMUM;
4972 case ISD::FMAXIMUMNUM:
4973 return ISD::FMINIMUMNUM;
4974 case ISD::FMINIMUMNUM:
4975 return ISD::FMAXIMUMNUM;
4976 case AMDGPUISD::FMAX_LEGACY:
4977 return AMDGPUISD::FMIN_LEGACY;
4978 case AMDGPUISD::FMIN_LEGACY:
4979 return AMDGPUISD::FMAX_LEGACY;
4980 default:
4981 llvm_unreachable("invalid min/max opcode");
4982 }
4983}
4984
4985/// \return true if it's profitable to try to push an fneg into its source
4986/// instruction.
4987static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4988 // If the input has multiple uses and we can either fold the negate down, or
4989 // the other uses cannot, give up. This both prevents unprofitable
4990 // transformations and infinite loops: we won't repeatedly try to fold around
4991 // a negate that has no 'good' form.
4992 if (N0.hasOneUse()) {
4993 // This may be able to fold into the source, but at a code size cost. Don't
4994 // fold if the fold into the user is free.
4995 if (allUsesHaveSourceMods(N, 0))
4996 return false;
4997 } else {
4998 if (fnegFoldsIntoOp(N0.getNode()) &&
4999 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
5000 return false;
5001 }
5002
5003 return true;
5004}
5005
5006SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5007 DAGCombinerInfo &DCI) const {
5008 SelectionDAG &DAG = DCI.DAG;
5009 SDValue N0 = N->getOperand(0);
5010 EVT VT = N->getValueType(0);
5011
5012 unsigned Opc = N0.getOpcode();
5013
5014 if (!shouldFoldFNegIntoSrc(N, N0))
5015 return SDValue();
5016
5017 SDLoc SL(N);
5018 switch (Opc) {
5019 case ISD::FADD: {
5020 if (!mayIgnoreSignedZero(N0))
5021 return SDValue();
5022
5023 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5024 SDValue LHS = N0.getOperand(0);
5025 SDValue RHS = N0.getOperand(1);
5026
5027 if (LHS.getOpcode() != ISD::FNEG)
5028 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5029 else
5030 LHS = LHS.getOperand(0);
5031
5032 if (RHS.getOpcode() != ISD::FNEG)
5033 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5034 else
5035 RHS = RHS.getOperand(0);
5036
5037 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5038 if (Res.getOpcode() != ISD::FADD)
5039 return SDValue(); // Op got folded away.
5040 if (!N0.hasOneUse())
5041 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5042 return Res;
5043 }
5044 case ISD::FMUL:
5045 case AMDGPUISD::FMUL_LEGACY: {
5046 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5047 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5048 SDValue LHS = N0.getOperand(0);
5049 SDValue RHS = N0.getOperand(1);
5050
5051 if (LHS.getOpcode() == ISD::FNEG)
5052 LHS = LHS.getOperand(0);
5053 else if (RHS.getOpcode() == ISD::FNEG)
5054 RHS = RHS.getOperand(0);
5055 else
5056 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5057
5058 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5059 if (Res.getOpcode() != Opc)
5060 return SDValue(); // Op got folded away.
5061 if (!N0.hasOneUse())
5062 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5063 return Res;
5064 }
5065 case ISD::FMA:
5066 case ISD::FMAD: {
5067 // TODO: handle llvm.amdgcn.fma.legacy
5068 if (!mayIgnoreSignedZero(N0))
5069 return SDValue();
5070
5071 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5072 SDValue LHS = N0.getOperand(0);
5073 SDValue MHS = N0.getOperand(1);
5074 SDValue RHS = N0.getOperand(2);
5075
5076 if (LHS.getOpcode() == ISD::FNEG)
5077 LHS = LHS.getOperand(0);
5078 else if (MHS.getOpcode() == ISD::FNEG)
5079 MHS = MHS.getOperand(0);
5080 else
5081 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5082
5083 if (RHS.getOpcode() != ISD::FNEG)
5084 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5085 else
5086 RHS = RHS.getOperand(0);
5087
5088 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5089 if (Res.getOpcode() != Opc)
5090 return SDValue(); // Op got folded away.
5091 if (!N0.hasOneUse())
5092 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5093 return Res;
5094 }
5095 case ISD::FMAXNUM:
5096 case ISD::FMINNUM:
5097 case ISD::FMAXNUM_IEEE:
5098 case ISD::FMINNUM_IEEE:
5099 case ISD::FMINIMUM:
5100 case ISD::FMAXIMUM:
5101 case ISD::FMINIMUMNUM:
5102 case ISD::FMAXIMUMNUM:
5103 case AMDGPUISD::FMAX_LEGACY:
5104 case AMDGPUISD::FMIN_LEGACY: {
5105 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5106 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5107 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5108 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5109
5110 SDValue LHS = N0.getOperand(0);
5111 SDValue RHS = N0.getOperand(1);
5112
5113 // 0 doesn't have a negated inline immediate.
5114 // TODO: This constant check should be generalized to other operations.
5115 if (isConstantCostlierToNegate(DAG, RHS))
5116 return SDValue();
5117
5118 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5119 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5120 unsigned Opposite = inverseMinMax(Opc);
5121
5122 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5123 if (Res.getOpcode() != Opposite)
5124 return SDValue(); // Op got folded away.
5125 if (!N0.hasOneUse())
5126 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5127 return Res;
5128 }
5129 case AMDGPUISD::FMED3: {
5130 SDValue Ops[3];
5131 for (unsigned I = 0; I < 3; ++I)
5132 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5133
5134 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5135 if (Res.getOpcode() != AMDGPUISD::FMED3)
5136 return SDValue(); // Op got folded away.
5137
5138 if (!N0.hasOneUse()) {
5139 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5140 DAG.ReplaceAllUsesWith(N0, Neg);
5141
5142 for (SDNode *U : Neg->users())
5143 DCI.AddToWorklist(U);
5144 }
5145
5146 return Res;
5147 }
5148 case ISD::FP_EXTEND:
5149 case ISD::FTRUNC:
5150 case ISD::FRINT:
5151 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5152 case ISD::FROUNDEVEN:
5153 case ISD::FSIN:
5154 case ISD::FCANONICALIZE:
5155  case AMDGPUISD::RCP:
5156  case AMDGPUISD::RCP_LEGACY:
5157  case AMDGPUISD::RCP_IFLAG:
5158 case AMDGPUISD::SIN_HW: {
5159 SDValue CvtSrc = N0.getOperand(0);
5160 if (CvtSrc.getOpcode() == ISD::FNEG) {
5161 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5162 // (fneg (rcp (fneg x))) -> (rcp x)
5163 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5164 }
5165
5166 if (!N0.hasOneUse())
5167 return SDValue();
5168
5169 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5170 // (fneg (rcp x)) -> (rcp (fneg x))
5171 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5172 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5173 }
5174 case ISD::FP_ROUND: {
5175 SDValue CvtSrc = N0.getOperand(0);
5176
5177 if (CvtSrc.getOpcode() == ISD::FNEG) {
5178 // (fneg (fp_round (fneg x))) -> (fp_round x)
5179 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5180 CvtSrc.getOperand(0), N0.getOperand(1));
5181 }
5182
5183 if (!N0.hasOneUse())
5184 return SDValue();
5185
5186 // (fneg (fp_round x)) -> (fp_round (fneg x))
5187 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5188 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5189 }
5190 case ISD::FP16_TO_FP: {
5191 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5192 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5193 // Put the fneg back as a legal source operation that can be matched later.
5194 SDLoc SL(N);
5195
5196 SDValue Src = N0.getOperand(0);
5197 EVT SrcVT = Src.getValueType();
5198
5199 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5200 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5201 DAG.getConstant(0x8000, SL, SrcVT));
5202 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5203 }
5204 case ISD::SELECT: {
5205 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5206 // TODO: Invert conditions of foldFreeOpFromSelect
5207 return SDValue();
5208 }
5209 case ISD::BITCAST: {
5210 SDLoc SL(N);
5211 SDValue BCSrc = N0.getOperand(0);
5212 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5213 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5214 if (HighBits.getValueType().getSizeInBits() != 32 ||
5215 !fnegFoldsIntoOp(HighBits.getNode()))
5216 return SDValue();
5217
5218      // f64 fneg only really needs to operate on the high half of the
5219 // register, so try to force it to an f32 operation to help make use of
5220 // source modifiers.
5221 //
5222 //
5223 // fneg (f64 (bitcast (build_vector x, y))) ->
5224 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5225 // (fneg (bitcast i32:y to f32)))
5226
5227 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5228 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5229 SDValue CastBack =
5230 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5231
5232 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5233 Ops.back() = CastBack;
5234 DCI.AddToWorklist(NegHi.getNode());
5235 SDValue Build =
5236 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5237 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5238
5239 if (!N0.hasOneUse())
5240 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5241 return Result;
5242 }
5243
5244 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5245 BCSrc.hasOneUse()) {
5246 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5247 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5248
5249 // TODO: Cast back result for multiple uses is beneficial in some cases.
5250
5251 SDValue LHS =
5252 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5253 SDValue RHS =
5254 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5255
5256 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5257 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5258
5259 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5260 NegRHS);
5261 }
5262
5263 return SDValue();
5264 }
5265 default:
5266 return SDValue();
5267 }
5268}
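The FADD and min/max cases above rely on simple sign algebra: negation distributes over addition, and negating a max of two values yields the min of their negations. A small standalone C++ check of those identities (not part of the file; the values are arbitrary examples):

#include <algorithm>
#include <cassert>

int main() {
  double X = 2.5, Y = -7.0;
  // (fneg (fadd x, y)) == (fadd (fneg x), (fneg y))
  assert(-(X + Y) == (-X) + (-Y));
  // (fneg (fmaxnum x, y)) == (fminnum (fneg x), (fneg y)); inverseMinMax does this opcode swap.
  assert(-std::max(X, Y) == std::min(-X, -Y));
  return 0;
}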
5269
5270 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5271 DAGCombinerInfo &DCI) const {
5272 SelectionDAG &DAG = DCI.DAG;
5273 SDValue N0 = N->getOperand(0);
5274
5275 if (!N0.hasOneUse())
5276 return SDValue();
5277
5278 switch (N0.getOpcode()) {
5279 case ISD::FP16_TO_FP: {
5280 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5281 SDLoc SL(N);
5282 SDValue Src = N0.getOperand(0);
5283 EVT SrcVT = Src.getValueType();
5284
5285 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5286 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5287 DAG.getConstant(0x7fff, SL, SrcVT));
5288 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5289 }
5290 default:
5291 return SDValue();
5292 }
5293}
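Both the fneg and fabs combines for FP16_TO_FP work on the integer source because IEEE negation and absolute value only touch the sign bit (0x8000 / 0x7fff in the 16-bit encoding). A standalone sketch of the same identity on f32 bit patterns, not taken from the file and requiring C++20 std::bit_cast:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  float X = 3.25f;
  // fneg: flip the sign bit of the encoding (0x8000 for half, 0x80000000 for float).
  float NegX = std::bit_cast<float>(std::bit_cast<uint32_t>(X) ^ 0x80000000u);
  // fabs: clear the sign bit of the encoding (0x7fff for half, 0x7fffffff for float).
  float AbsX = std::bit_cast<float>(std::bit_cast<uint32_t>(NegX) & 0x7fffffffu);
  assert(NegX == -3.25f && AbsX == 3.25f);
  return 0;
}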
5294
5295 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5296 DAGCombinerInfo &DCI) const {
5297 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5298 if (!CFP)
5299 return SDValue();
5300
5301 // XXX - Should this flush denormals?
5302 const APFloat &Val = CFP->getValueAPF();
5303 APFloat One(Val.getSemantics(), "1.0");
5304 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5305}
5306
5307 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5308 DAGCombinerInfo &DCI) const {
5309 SelectionDAG &DAG = DCI.DAG;
5310 SDLoc DL(N);
5311
5312 switch(N->getOpcode()) {
5313 default:
5314 break;
5315 case ISD::BITCAST: {
5316 EVT DestVT = N->getValueType(0);
5317
5318 // Push casts through vector builds. This helps avoid emitting a large
5319 // number of copies when materializing floating point vector constants.
5320 //
5321 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5322    // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5323 if (DestVT.isVector()) {
5324 SDValue Src = N->getOperand(0);
5325 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5328 EVT SrcVT = Src.getValueType();
5329 unsigned NElts = DestVT.getVectorNumElements();
5330
5331 if (SrcVT.getVectorNumElements() == NElts) {
5332 EVT DestEltVT = DestVT.getVectorElementType();
5333
5334 SmallVector<SDValue, 8> CastedElts;
5335 SDLoc SL(N);
5336 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5337 SDValue Elt = Src.getOperand(I);
5338 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5339 }
5340
5341 return DAG.getBuildVector(DestVT, SL, CastedElts);
5342 }
5343 }
5344 }
5345
5346 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5347 break;
5348
5349 // Fold bitcasts of constants.
5350 //
5351 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5352 // TODO: Generalize and move to DAGCombiner
5353 SDValue Src = N->getOperand(0);
5354 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5355 SDLoc SL(N);
5356 uint64_t CVal = C->getZExtValue();
5357 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5358 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5359 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5360 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5361 }
5362
5363 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5364 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5365 SDLoc SL(N);
5366 uint64_t CVal = Val.getZExtValue();
5367 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5368 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5369 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5370
5371 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5372 }
5373
5374 break;
5375 }
5376 case ISD::SHL:
5377 case ISD::SRA:
5378 case ISD::SRL: {
5379 // Range metadata can be invalidated when loads are converted to legal types
5380 // (e.g. v2i64 -> v4i32).
5381 // Try to convert vector shl/sra/srl before type legalization so that range
5382 // metadata can be utilized.
5383 if (!(N->getValueType(0).isVector() &&
5386 break;
5387 if (N->getOpcode() == ISD::SHL)
5388 return performShlCombine(N, DCI);
5389 if (N->getOpcode() == ISD::SRA)
5390 return performSraCombine(N, DCI);
5391 return performSrlCombine(N, DCI);
5392 }
5393 case ISD::TRUNCATE:
5394 return performTruncateCombine(N, DCI);
5395 case ISD::MUL:
5396 return performMulCombine(N, DCI);
5397 case AMDGPUISD::MUL_U24:
5398 case AMDGPUISD::MUL_I24: {
5399 if (SDValue Simplified = simplifyMul24(N, DCI))
5400 return Simplified;
5401 break;
5402  }
5403  case AMDGPUISD::MULHI_I24:
5404  case AMDGPUISD::MULHI_U24:
5405 return simplifyMul24(N, DCI);
5406 case ISD::SMUL_LOHI:
5407 case ISD::UMUL_LOHI:
5408 return performMulLoHiCombine(N, DCI);
5409 case ISD::MULHS:
5410 return performMulhsCombine(N, DCI);
5411 case ISD::MULHU:
5412 return performMulhuCombine(N, DCI);
5413 case ISD::SELECT:
5414 return performSelectCombine(N, DCI);
5415 case ISD::FNEG:
5416 return performFNegCombine(N, DCI);
5417 case ISD::FABS:
5418 return performFAbsCombine(N, DCI);
5419 case AMDGPUISD::BFE_I32:
5420 case AMDGPUISD::BFE_U32: {
5421 assert(!N->getValueType(0).isVector() &&
5422 "Vector handling of BFE not implemented");
5423 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5424 if (!Width)
5425 break;
5426
5427 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5428 if (WidthVal == 0)
5429 return DAG.getConstant(0, DL, MVT::i32);
5430
5431 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5432 if (!Offset)
5433 break;
5434
5435 SDValue BitsFrom = N->getOperand(0);
5436 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5437
5438 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5439
5440 if (OffsetVal == 0) {
5441 // This is already sign / zero extended, so try to fold away extra BFEs.
5442 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5443
5444 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5445 if (OpSignBits >= SignBits)
5446 return BitsFrom;
5447
5448 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5449 if (Signed) {
5450 // This is a sign_extend_inreg. Replace it to take advantage of existing
5451 // DAG Combines. If not eliminated, we will match back to BFE during
5452 // selection.
5453
5454        // TODO: The sext_inreg of extended types ends, although we could
5455 // handle them in a single BFE.
5456 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5457 DAG.getValueType(SmallVT));
5458 }
5459
5460 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5461 }
5462
5463 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5464 if (Signed) {
5465 return constantFoldBFE<int32_t>(DAG,
5466 CVal->getSExtValue(),
5467 OffsetVal,
5468 WidthVal,
5469 DL);
5470 }
5471
5472 return constantFoldBFE<uint32_t>(DAG,
5473 CVal->getZExtValue(),
5474 OffsetVal,
5475 WidthVal,
5476 DL);
5477 }
5478
5479 if ((OffsetVal + WidthVal) >= 32 &&
5480 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5481 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5482 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5483 BitsFrom, ShiftVal);
5484 }
5485
5486 if (BitsFrom.hasOneUse()) {
5487 APInt Demanded = APInt::getBitsSet(32,
5488 OffsetVal,
5489 OffsetVal + WidthVal);
5490
5491      KnownBits Known;
5492      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5493 !DCI.isBeforeLegalizeOps());
5494 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5495 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5496 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5497 DCI.CommitTargetLoweringOpt(TLO);
5498 }
5499 }
5500
5501 break;
5502 }
5503 case ISD::LOAD:
5504 return performLoadCombine(N, DCI);
5505 case ISD::STORE:
5506 return performStoreCombine(N, DCI);
5507  case AMDGPUISD::RCP:
5508  case AMDGPUISD::RCP_IFLAG:
5509 return performRcpCombine(N, DCI);
5510 case ISD::AssertZext:
5511 case ISD::AssertSext:
5512    return performAssertSZExtCombine(N, DCI);
5513  case ISD::INTRINSIC_WO_CHAIN:
5514 return performIntrinsicWOChainCombine(N, DCI);
5515 case AMDGPUISD::FMAD_FTZ: {
5516 SDValue N0 = N->getOperand(0);
5517 SDValue N1 = N->getOperand(1);
5518 SDValue N2 = N->getOperand(2);
5519 EVT VT = N->getValueType(0);
5520
5521 // FMAD_FTZ is a FMAD + flush denormals to zero.
5522 // We flush the inputs, the intermediate step, and the output.
5523 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5524 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5525 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5526 if (N0CFP && N1CFP && N2CFP) {
5527 const auto FTZ = [](const APFloat &V) {
5528 if (V.isDenormal()) {
5529 APFloat Zero(V.getSemantics(), 0);
5530 return V.isNegative() ? -Zero : Zero;
5531 }
5532 return V;
5533 };
5534
5535 APFloat V0 = FTZ(N0CFP->getValueAPF());
5536 APFloat V1 = FTZ(N1CFP->getValueAPF());
5537      APFloat V2 = FTZ(N2CFP->getValueAPF());
5538      V0.multiply(V1, APFloat::rmNearestTiesToEven);
5539      V0 = FTZ(V0);
5540      V0.add(V2, APFloat::rmNearestTiesToEven);
5541 return DAG.getConstantFP(FTZ(V0), DL, VT);
5542 }
5543 break;
5544 }
5545 }
5546 return SDValue();
5547}
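The BFE_I32/BFE_U32 combine above folds constant operands through constantFoldBFE, whose behaviour is an ordinary bitfield extract: take Width bits starting at Offset (both already masked to 0..31) and zero- or sign-extend them. A rough standalone model of that semantics, assuming Offset + Width <= 32; the helper names are illustrative only, not LLVM APIs:

#include <cassert>
#include <cstdint>

// Zero-extending bitfield extract of Width bits starting at Offset.
static uint32_t bfeU32(uint32_t Src, unsigned Offset, unsigned Width) {
  if (Width == 0)
    return 0;
  uint32_t Mask = Width < 32 ? (1u << Width) - 1 : ~0u;
  return (Src >> Offset) & Mask;
}

// Sign-extending bitfield extract of Width bits starting at Offset.
static int32_t bfeI32(uint32_t Src, unsigned Offset, unsigned Width) {
  if (Width == 0)
    return 0;
  return (int32_t)(Src << (32 - Offset - Width)) >> (32 - Width);
}

int main() {
  assert(bfeU32(0xABCD1234u, 8, 8) == 0x12);
  assert(bfeI32(0x0000F000u, 12, 4) == -1); // field 0b1111 sign-extends to -1
  assert(bfeI32(0x00007000u, 12, 4) == 7);  // field 0b0111 stays positive
  return 0;
}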
5548
5549//===----------------------------------------------------------------------===//
5550// Helper functions
5551//===----------------------------------------------------------------------===//
5552
5553 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5554 const TargetRegisterClass *RC,
5555 Register Reg, EVT VT,
5556 const SDLoc &SL,
5557                                                    bool RawReg) const {
5558   MachineFunction &MF = DAG.getMachineFunction();
5559   MachineRegisterInfo &MRI = MF.getRegInfo();
5560 Register VReg;
5561
5562 if (!MRI.isLiveIn(Reg)) {
5563 VReg = MRI.createVirtualRegister(RC);
5564 MRI.addLiveIn(Reg, VReg);
5565 } else {
5566 VReg = MRI.getLiveInVirtReg(Reg);
5567 }
5568
5569 if (RawReg)
5570 return DAG.getRegister(VReg, VT);
5571
5572 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5573}
5574
5575// This may be called multiple times, and nothing prevents creating multiple
5576 // objects at the same offset. See if we already defined this object.
5577 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5578 int64_t Offset) {
5579 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5580 if (MFI.getObjectOffset(I) == Offset) {
5581 assert(MFI.getObjectSize(I) == Size);
5582 return I;
5583 }
5584 }
5585
5586 return MFI.CreateFixedObject(Size, Offset, true);
5587}
5588
5589 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5590 EVT VT,
5591 const SDLoc &SL,
5592                                                   int64_t Offset) const {
5593   MachineFunction &MF = DAG.getMachineFunction();
5594 MachineFrameInfo &MFI = MF.getFrameInfo();
5595 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5596
5597 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5598 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5599
5600   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5601                      MachineMemOperand::MODereferenceable |
5602                          MachineMemOperand::MOInvariant);
5603}
5604
5605 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5606 const SDLoc &SL,
5607 SDValue Chain,
5608 SDValue ArgVal,
5609                                                    int64_t Offset) const {
5610   MachineFunction &MF = DAG.getMachineFunction();
5611   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5612   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5613
5614 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5615 // Stores to the argument stack area are relative to the stack pointer.
5616 SDValue SP =
5617 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5618 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5619   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5620                                MachineMemOperand::MODereferenceable);
5621 return Store;
5622}
5623
5624 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5625 const TargetRegisterClass *RC,
5626 EVT VT, const SDLoc &SL,
5627 const ArgDescriptor &Arg) const {
5628 assert(Arg && "Attempting to load missing argument");
5629
5630 SDValue V = Arg.isRegister() ?
5631 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5632 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5633
5634 if (!Arg.isMasked())
5635 return V;
5636
5637 unsigned Mask = Arg.getMask();
5638 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5639 V = DAG.getNode(ISD::SRL, SL, VT, V,
5640 DAG.getShiftAmountConstant(Shift, VT, SL));
5641 return DAG.getNode(ISD::AND, SL, VT, V,
5642 DAG.getConstant(Mask >> Shift, SL, VT));
5643}
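loadInputValue decodes a masked argument by shifting the packed register right by the number of trailing zeros in the mask and then AND-ing with the shifted mask. A standalone illustration with a made-up 10-bit field layout (the layout and values are assumptions, not taken from the file; requires C++20 <bit>):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical packed input register holding three 10-bit fields.
  uint32_t Packed = (7u << 20) | (5u << 10) | 3u;
  uint32_t Mask = 0x3FFu << 10;            // descriptor mask for the middle field
  unsigned Shift = std::countr_zero(Mask); // == 10
  uint32_t Value = (Packed >> Shift) & (Mask >> Shift);
  assert(Value == 5);
  return 0;
}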
5644
5645 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5646 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5647 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5648 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5649 uint64_t ArgOffset =
5650 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5651 switch (Param) {
5652 case FIRST_IMPLICIT:
5653 return ArgOffset;
5654  case PRIVATE_BASE:
5655    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5656 case SHARED_BASE:
5657 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5658 case QUEUE_PTR:
5659 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5660 }
5661 llvm_unreachable("unexpected implicit parameter type");
5662}
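The offset computation above rounds the explicit kernel-argument size up to the implicit-argument alignment and then adds the target's fixed explicit-argument offset. A minimal sketch of that arithmetic for a power-of-two alignment (llvm::alignTo handles the general case; the byte sizes here are made up):

#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of a power-of-two alignment.
static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  // 36 bytes of explicit kernel arguments aligned up to 8 puts the first
  // implicit parameter at byte 40 (plus any fixed explicit-arg offset).
  assert(alignToPow2(36, 8) == 40);
  assert(alignToPow2(40, 8) == 40);
  return 0;
}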
5663
5664 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5665     const MachineFunction &MF, const ImplicitParameter Param) const {
5666   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5667   return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5668 }
5669
5670#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5671
5672const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5673 switch ((AMDGPUISD::NodeType)Opcode) {
5674 case AMDGPUISD::FIRST_NUMBER: break;
5675 // AMDIL DAG nodes
5676 NODE_NAME_CASE(BRANCH_COND);
5677
5678 // AMDGPU DAG nodes
5679 NODE_NAME_CASE(IF)
5680 NODE_NAME_CASE(ELSE)
5681 NODE_NAME_CASE(LOOP)
5682 NODE_NAME_CASE(CALL)
5683 NODE_NAME_CASE(TC_RETURN)
5684 NODE_NAME_CASE(TC_RETURN_GFX)
5685 NODE_NAME_CASE(TC_RETURN_CHAIN)
5686 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
5687 NODE_NAME_CASE(TRAP)
5688 NODE_NAME_CASE(RET_GLUE)
5689 NODE_NAME_CASE(WAVE_ADDRESS)
5690 NODE_NAME_CASE(RETURN_TO_EPILOG)
5691 NODE_NAME_CASE(ENDPGM)
5692 NODE_NAME_CASE(ENDPGM_TRAP)
5693 NODE_NAME_CASE(SIMULATED_TRAP)
5694 NODE_NAME_CASE(DWORDADDR)
5695 NODE_NAME_CASE(FRACT)
5696 NODE_NAME_CASE(SETCC)
5697 NODE_NAME_CASE(DENORM_MODE)
5698 NODE_NAME_CASE(FMA_W_CHAIN)
5699 NODE_NAME_CASE(FMUL_W_CHAIN)
5700 NODE_NAME_CASE(CLAMP)
5701 NODE_NAME_CASE(COS_HW)
5702 NODE_NAME_CASE(SIN_HW)
5703 NODE_NAME_CASE(FMAX_LEGACY)
5704 NODE_NAME_CASE(FMIN_LEGACY)
5705 NODE_NAME_CASE(FMAX3)
5706 NODE_NAME_CASE(SMAX3)
5707 NODE_NAME_CASE(UMAX3)
5708 NODE_NAME_CASE(FMIN3)
5709 NODE_NAME_CASE(SMIN3)
5710 NODE_NAME_CASE(UMIN3)
5711 NODE_NAME_CASE(FMED3)
5712 NODE_NAME_CASE(SMED3)
5713 NODE_NAME_CASE(UMED3)
5714 NODE_NAME_CASE(FMAXIMUM3)
5715 NODE_NAME_CASE(FMINIMUM3)
5716 NODE_NAME_CASE(FDOT2)
5717 NODE_NAME_CASE(URECIP)
5718 NODE_NAME_CASE(DIV_SCALE)
5719 NODE_NAME_CASE(DIV_FMAS)
5720 NODE_NAME_CASE(DIV_FIXUP)
5721 NODE_NAME_CASE(FMAD_FTZ)
5722 NODE_NAME_CASE(RCP)
5723 NODE_NAME_CASE(RSQ)
5724 NODE_NAME_CASE(RCP_LEGACY)
5725 NODE_NAME_CASE(RCP_IFLAG)
5726 NODE_NAME_CASE(LOG)
5727 NODE_NAME_CASE(EXP)
5728 NODE_NAME_CASE(FMUL_LEGACY)
5729 NODE_NAME_CASE(RSQ_CLAMP)
5730 NODE_NAME_CASE(FP_CLASS)
5731 NODE_NAME_CASE(DOT4)
5732 NODE_NAME_CASE(CARRY)
5733 NODE_NAME_CASE(BORROW)
5734 NODE_NAME_CASE(BFE_U32)
5735 NODE_NAME_CASE(BFE_I32)
5736 NODE_NAME_CASE(BFI)
5737 NODE_NAME_CASE(BFM)
5738 NODE_NAME_CASE(FFBH_U32)
5739 NODE_NAME_CASE(FFBH_I32)
5740 NODE_NAME_CASE(FFBL_B32)
5741 NODE_NAME_CASE(MUL_U24)
5742 NODE_NAME_CASE(MUL_I24)
5743 NODE_NAME_CASE(MULHI_U24)
5744 NODE_NAME_CASE(MULHI_I24)
5745 NODE_NAME_CASE(MAD_U24)
5746 NODE_NAME_CASE(MAD_I24)
5747 NODE_NAME_CASE(MAD_I64_I32)
5748 NODE_NAME_CASE(MAD_U64_U32)
5749 NODE_NAME_CASE(PERM)
5750 NODE_NAME_CASE(TEXTURE_FETCH)
5751 NODE_NAME_CASE(R600_EXPORT)
5752 NODE_NAME_CASE(CONST_ADDRESS)
5753 NODE_NAME_CASE(REGISTER_LOAD)
5754 NODE_NAME_CASE(REGISTER_STORE)
5755 NODE_NAME_CASE(CVT_F32_UBYTE0)
5756 NODE_NAME_CASE(CVT_F32_UBYTE1)
5757 NODE_NAME_CASE(CVT_F32_UBYTE2)
5758 NODE_NAME_CASE(CVT_F32_UBYTE3)
5759 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5760 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5761 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5762 NODE_NAME_CASE(CVT_PK_I16_I32)
5763 NODE_NAME_CASE(CVT_PK_U16_U32)
5764 NODE_NAME_CASE(FP_TO_FP16)
5765 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5766 NODE_NAME_CASE(CONST_DATA_PTR)
5767 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5768  NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
5769  NODE_NAME_CASE(LDS)
5770 NODE_NAME_CASE(DUMMY_CHAIN)
5771 NODE_NAME_CASE(LOAD_D16_HI)
5772 NODE_NAME_CASE(LOAD_D16_LO)
5773 NODE_NAME_CASE(LOAD_D16_HI_I8)
5774 NODE_NAME_CASE(LOAD_D16_HI_U8)
5775 NODE_NAME_CASE(LOAD_D16_LO_I8)
5776 NODE_NAME_CASE(LOAD_D16_LO_U8)
5777 NODE_NAME_CASE(STORE_MSKOR)
5778 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5779 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5780 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5781 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5782 NODE_NAME_CASE(DS_ORDERED_COUNT)
5783 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5784 NODE_NAME_CASE(BUFFER_LOAD)
5785 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5786 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5787 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5788 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5789 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5790 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5791 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5792 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5793 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5794 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5795 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5796 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5797 NODE_NAME_CASE(SBUFFER_LOAD)
5798 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5799 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5800 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5801 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5802 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5803 NODE_NAME_CASE(BUFFER_STORE)
5804 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5805 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5806 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5807 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5808 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5809 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5810 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5811 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5812 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5813 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5814 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5815 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5816 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5817 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5818 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5819 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5820 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5821 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5822 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5823 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5824 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5825 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5826 NODE_NAME_CASE(WHOLE_WAVE_SETUP)
5827 NODE_NAME_CASE(WHOLE_WAVE_RETURN)
5828 }
5829 return nullptr;
5830}
5831
5832 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5833 SelectionDAG &DAG, int Enabled,
5834 int &RefinementSteps,
5835 bool &UseOneConstNR,
5836 bool Reciprocal) const {
5837 EVT VT = Operand.getValueType();
5838
5839 if (VT == MVT::f32) {
5840 RefinementSteps = 0;
5841 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5842 }
5843
5844  // TODO: There is also an f64 rsq instruction, but the documentation is less
5845 // clear on its precision.
5846
5847 return SDValue();
5848}
5849
5850 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5851 SelectionDAG &DAG, int Enabled,
5852 int &RefinementSteps) const {
5853 EVT VT = Operand.getValueType();
5854
5855 if (VT == MVT::f32) {
5856 // Reciprocal, < 1 ulp error.
5857 //
5858    // This reciprocal approximation converges to < 0.5 ulp error with one
5859    // Newton-Raphson step performed with two fused multiply-adds (FMAs).
5860
5861 RefinementSteps = 0;
5862 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5863 }
5864
5865  // TODO: There is also an f64 rcp instruction, but the documentation is less
5866 // clear on its precision.
5867
5868 return SDValue();
5869}
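The comment above describes refining a reciprocal estimate with a Newton-Raphson step built from two FMAs; AMDGPU's RCP already meets the precision target, so RefinementSteps stays at 0. A standalone sketch of what one such refinement step looks like (values are arbitrary; this is not code from the file):

#include <cassert>
#include <cmath>

int main() {
  double D = 3.0;
  double X = 0.3; // rough initial estimate of 1/D
  // One Newton-Raphson step expressed as two fused multiply-adds:
  //   E = fma(-D, X, 1.0); X = fma(X, E, X);
  double E = std::fma(-D, X, 1.0);
  X = std::fma(X, E, X);
  assert(std::fabs(X - 1.0 / D) < 1e-2);
  return 0;
}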
5870
5871static unsigned workitemIntrinsicDim(unsigned ID) {
5872 switch (ID) {
5873 case Intrinsic::amdgcn_workitem_id_x:
5874 return 0;
5875 case Intrinsic::amdgcn_workitem_id_y:
5876 return 1;
5877 case Intrinsic::amdgcn_workitem_id_z:
5878 return 2;
5879 default:
5880 llvm_unreachable("not a workitem intrinsic");
5881 }
5882}
5883
5884 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5885 const SDValue Op, KnownBits &Known,
5886 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5887
5888 Known.resetAll(); // Don't know anything.
5889
5890 unsigned Opc = Op.getOpcode();
5891
5892 switch (Opc) {
5893 default:
5894 break;
5895 case AMDGPUISD::CARRY:
5896 case AMDGPUISD::BORROW: {
5897 Known.Zero = APInt::getHighBitsSet(32, 31);
5898 break;
5899 }
5900
5901 case AMDGPUISD::BFE_I32:
5902 case AMDGPUISD::BFE_U32: {
5903 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5904 if (!CWidth)
5905 return;
5906
5907 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5908
5909 if (Opc == AMDGPUISD::BFE_U32)
5910 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5911
5912 break;
5913 }
5914 case AMDGPUISD::FP_TO_FP16: {
5915 unsigned BitWidth = Known.getBitWidth();
5916
5917    // High bits are zero.
5918    Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5919 break;
5920 }
5921 case AMDGPUISD::MUL_U24:
5922 case AMDGPUISD::MUL_I24: {
5923 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5924 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5925 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5926 RHSKnown.countMinTrailingZeros();
5927 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5928 // Skip extra check if all bits are known zeros.
5929 if (TrailZ >= 32)
5930 break;
5931
5932 // Truncate to 24 bits.
5933 LHSKnown = LHSKnown.trunc(24);
5934 RHSKnown = RHSKnown.trunc(24);
5935
5936 if (Opc == AMDGPUISD::MUL_I24) {
5937 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5938 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5939 unsigned MaxValBits = LHSValBits + RHSValBits;
5940 if (MaxValBits > 32)
5941 break;
5942 unsigned SignBits = 32 - MaxValBits + 1;
5943 bool LHSNegative = LHSKnown.isNegative();
5944 bool LHSNonNegative = LHSKnown.isNonNegative();
5945 bool LHSPositive = LHSKnown.isStrictlyPositive();
5946 bool RHSNegative = RHSKnown.isNegative();
5947 bool RHSNonNegative = RHSKnown.isNonNegative();
5948 bool RHSPositive = RHSKnown.isStrictlyPositive();
5949
5950 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5951 Known.Zero.setHighBits(SignBits);
5952 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5953 Known.One.setHighBits(SignBits);
5954 } else {
5955 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5956 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5957 unsigned MaxValBits = LHSValBits + RHSValBits;
5958 if (MaxValBits >= 32)
5959 break;
5960 Known.Zero.setBitsFrom(MaxValBits);
5961 }
5962 break;
5963 }
5964 case AMDGPUISD::PERM: {
5965 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5966 if (!CMask)
5967 return;
5968
5969 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5970 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5971 unsigned Sel = CMask->getZExtValue();
5972
5973 for (unsigned I = 0; I < 32; I += 8) {
5974 unsigned SelBits = Sel & 0xff;
5975 if (SelBits < 4) {
5976 SelBits *= 8;
5977 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5978 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5979 } else if (SelBits < 7) {
5980 SelBits = (SelBits & 3) * 8;
5981 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5982 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5983 } else if (SelBits == 0x0c) {
5984 Known.Zero |= 0xFFull << I;
5985 } else if (SelBits > 0x0c) {
5986 Known.One |= 0xFFull << I;
5987 }
5988 Sel >>= 8;
5989 }
5990 break;
5991  }
5992  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5993 Known.Zero.setHighBits(24);
5994 break;
5995  }
5996  case AMDGPUISD::BUFFER_LOAD_USHORT: {
5997 Known.Zero.setHighBits(16);
5998 break;
5999 }
6000 case AMDGPUISD::LDS: {
6001 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
6002 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
6003
6004 Known.Zero.setHighBits(16);
6005 Known.Zero.setLowBits(Log2(Alignment));
6006 break;
6007 }
6008 case AMDGPUISD::SMIN3:
6009 case AMDGPUISD::SMAX3:
6010 case AMDGPUISD::SMED3:
6011 case AMDGPUISD::UMIN3:
6012 case AMDGPUISD::UMAX3:
6013 case AMDGPUISD::UMED3: {
6014 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
6015 if (Known2.isUnknown())
6016 break;
6017
6018 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6019 if (Known1.isUnknown())
6020 break;
6021
6022 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6023 if (Known0.isUnknown())
6024 break;
6025
6026 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6027 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6028 Known.One = Known0.One & Known1.One & Known2.One;
6029 break;
6030  }
6031  case ISD::INTRINSIC_WO_CHAIN: {
6032 unsigned IID = Op.getConstantOperandVal(0);
6033 switch (IID) {
6034 case Intrinsic::amdgcn_workitem_id_x:
6035 case Intrinsic::amdgcn_workitem_id_y:
6036 case Intrinsic::amdgcn_workitem_id_z: {
6037      unsigned MaxValue = Subtarget->getMaxWorkitemID(
6038          DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
6039 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6040 break;
6041 }
6042 default:
6043 break;
6044 }
6045 }
6046 }
6047}
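The MUL_U24 known-bits case above bounds the product by the sum of the operands' active bit counts: a value needing at most A bits times a value needing at most B bits needs at most A + B bits, so everything above that is known zero. A standalone numeric check of that bound (values chosen arbitrarily; requires C++20 <bit>):

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint32_t LHS = 0x3FF; // 10 active bits
  uint32_t RHS = 0xFF;  // 8 active bits
  uint32_t Prod = (LHS & 0xFFFFFF) * (RHS & 0xFFFFFF); // mul_u24 uses the low 24 bits
  assert(std::bit_width(Prod) <= std::bit_width(LHS) + std::bit_width(RHS));
  return 0;
}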
6048
6049 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6050 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6051 unsigned Depth) const {
6052 switch (Op.getOpcode()) {
6053 case AMDGPUISD::BFE_I32: {
6054 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6055 if (!Width)
6056 return 1;
6057
6058 unsigned SignBits = 32 - Width->getZExtValue() + 1;
6059 if (!isNullConstant(Op.getOperand(1)))
6060 return SignBits;
6061
6062 // TODO: Could probably figure something out with non-0 offsets.
6063 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6064 return std::max(SignBits, Op0SignBits);
6065 }
6066
6067 case AMDGPUISD::BFE_U32: {
6068 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6069 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6070 }
6071
6072 case AMDGPUISD::CARRY:
6073 case AMDGPUISD::BORROW:
6074    return 31;
6075  case AMDGPUISD::BUFFER_LOAD_BYTE:
6076    return 25;
6077  case AMDGPUISD::BUFFER_LOAD_SHORT:
6078    return 17;
6079  case AMDGPUISD::BUFFER_LOAD_UBYTE:
6080    return 24;
6081  case AMDGPUISD::BUFFER_LOAD_USHORT:
6082    return 16;
6083  case AMDGPUISD::FP_TO_FP16:
6084    return 16;
6085 case AMDGPUISD::SMIN3:
6086 case AMDGPUISD::SMAX3:
6087 case AMDGPUISD::SMED3:
6088 case AMDGPUISD::UMIN3:
6089 case AMDGPUISD::UMAX3:
6090 case AMDGPUISD::UMED3: {
6091 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6092 if (Tmp2 == 1)
6093 return 1; // Early out.
6094
6095 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6096 if (Tmp1 == 1)
6097 return 1; // Early out.
6098
6099 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6100 if (Tmp0 == 1)
6101 return 1; // Early out.
6102
6103 return std::min({Tmp0, Tmp1, Tmp2});
6104 }
6105 default:
6106 return 1;
6107 }
6108}
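For BFE_I32 with a zero offset, the result is a Width-bit field sign-extended to 32 bits, which is why the code above reports 32 - Width + 1 sign bits. A small standalone check of that count (the helper is illustrative, not an LLVM API):

#include <cassert>
#include <cstdint>

// Count how many copies of the sign bit lead a 32-bit value (always >= 1).
static unsigned numSignBits(int32_t V) {
  unsigned N = 1;
  while (N < 32 && ((V >> 31) & 1) == ((V >> (31 - N)) & 1))
    ++N;
  return N;
}

int main() {
  // An 8-bit field sign-extended to 32 bits has at least 32 - 8 + 1 = 25 sign bits.
  int32_t V = (int32_t)(int8_t)0xAB;
  assert(numSignBits(V) >= 25);
  return 0;
}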
6109
6110 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6111 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6112 const MachineRegisterInfo &MRI, unsigned Depth) const {
6113 const MachineInstr *MI = MRI.getVRegDef(R);
6114 if (!MI)
6115 return 1;
6116
6117 // TODO: Check range metadata on MMO.
6118 switch (MI->getOpcode()) {
6119 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6120 return 25;
6121 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6122 return 17;
6123 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6124 return 24;
6125 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6126 return 16;
6127 case AMDGPU::G_AMDGPU_SMED3:
6128 case AMDGPU::G_AMDGPU_UMED3: {
6129 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6130 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6131 if (Tmp2 == 1)
6132 return 1;
6133 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6134 if (Tmp1 == 1)
6135 return 1;
6136 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6137 if (Tmp0 == 1)
6138 return 1;
6139 return std::min({Tmp0, Tmp1, Tmp2});
6140 }
6141 default:
6142 return 1;
6143 }
6144}
6145
6146 bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6147 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6148 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6149 unsigned Opcode = Op.getOpcode();
6150 switch (Opcode) {
6151 case AMDGPUISD::BFE_I32:
6152 case AMDGPUISD::BFE_U32:
6153 return false;
6154  }
6155  return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6156 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6157}
6158
6159 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6160 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6161 unsigned Depth) const {
6162 unsigned Opcode = Op.getOpcode();
6163  switch (Opcode) {
6164  case AMDGPUISD::FMIN_LEGACY:
6165  case AMDGPUISD::FMAX_LEGACY: {
6166 if (SNaN)
6167 return true;
6168
6169 // TODO: Can check no nans on one of the operands for each one, but which
6170 // one?
6171 return false;
6172  }
6173  case AMDGPUISD::FMUL_LEGACY:
6174  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6175 if (SNaN)
6176 return true;
6177 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6178 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6179 }
6180 case AMDGPUISD::FMED3:
6181 case AMDGPUISD::FMIN3:
6182  case AMDGPUISD::FMAX3:
6183  case AMDGPUISD::FMINIMUM3:
6184  case AMDGPUISD::FMAXIMUM3:
6185 case AMDGPUISD::FMAD_FTZ: {
6186 if (SNaN)
6187 return true;
6188 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6189 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6190 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6191  }
6192  case AMDGPUISD::CVT_F32_UBYTE0:
6193  case AMDGPUISD::CVT_F32_UBYTE1:
6194  case AMDGPUISD::CVT_F32_UBYTE2:
6195  case AMDGPUISD::CVT_F32_UBYTE3:
6196 return true;
6197
6198 case AMDGPUISD::RCP:
6199  case AMDGPUISD::RSQ:
6200  case AMDGPUISD::RCP_LEGACY:
6201 case AMDGPUISD::RSQ_CLAMP: {
6202 if (SNaN)
6203 return true;
6204
6205    // TODO: Need an is-known-positive check.
6206 return false;
6207 }
6208 case ISD::FLDEXP:
6209 case AMDGPUISD::FRACT: {
6210 if (SNaN)
6211 return true;
6212 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6213  }
6214  case AMDGPUISD::DIV_SCALE:
6215  case AMDGPUISD::DIV_FMAS:
6216  case AMDGPUISD::DIV_FIXUP:
6217 // TODO: Refine on operands.
6218 return SNaN;
6219 case AMDGPUISD::SIN_HW:
6220 case AMDGPUISD::COS_HW: {
6221 // TODO: Need check for infinity
6222 return SNaN;
6223  }
6224  case ISD::INTRINSIC_WO_CHAIN: {
6225 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6226 // TODO: Handle more intrinsics
6227 switch (IntrinsicID) {
6228 case Intrinsic::amdgcn_cubeid:
6229 case Intrinsic::amdgcn_cvt_off_f32_i4:
6230 return true;
6231
6232 case Intrinsic::amdgcn_frexp_mant: {
6233 if (SNaN)
6234 return true;
6235 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6236 }
6237 case Intrinsic::amdgcn_cvt_pkrtz: {
6238 if (SNaN)
6239 return true;
6240 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6241 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6242 }
6243 case Intrinsic::amdgcn_rcp:
6244 case Intrinsic::amdgcn_rsq:
6245 case Intrinsic::amdgcn_rcp_legacy:
6246 case Intrinsic::amdgcn_rsq_legacy:
6247 case Intrinsic::amdgcn_rsq_clamp:
6248 case Intrinsic::amdgcn_tanh: {
6249 if (SNaN)
6250 return true;
6251
6252    // TODO: Need an is-known-positive check.
6253 return false;
6254 }
6255 case Intrinsic::amdgcn_trig_preop:
6256 case Intrinsic::amdgcn_fdot2:
6257 // TODO: Refine on operand
6258 return SNaN;
6259 case Intrinsic::amdgcn_fma_legacy:
6260 if (SNaN)
6261 return true;
6262 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6263 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6264 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6265 default:
6266 return false;
6267 }
6268 }
6269 default:
6270 return false;
6271 }
6272}
6273
6274 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6275 Register N0, Register N1) const {
6276 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6277}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:315
#define LLVM_READONLY
Definition: Compiler.h:322
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1414
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1181
const fltSemantics & getSemantics() const
Definition: APFloat.h:1457
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1199
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1158
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1098
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:258
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1150
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:306
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:296
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1388
This class represents an incoming formal argument to a Function.
Definition: Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:363
iterator_range< arg_iterator > args()
Definition: Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:270
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:663
Type * getValueType() const
Definition: GlobalValue.h:298
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:72
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:285
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:229
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:578
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:504
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:868
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition: SelectionDAG.h:839
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:498
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
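As a hedged sketch of how these node-creation helpers are typically used (emitMulBy5 is a hypothetical helper, not something defined in this file):
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: lower X * 5 as (X << 2) + X for an integer value already in the DAG.
static SDValue emitMulBy5(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  EVT VT = X.getValueType();
  SDValue ShAmt = DAG.getShiftAmountConstant(2, VT, DL);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
  return DAG.getNode(ISD::ADD, DL, VT, Shl, X);
}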
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:707
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:493
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:511
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:587
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:581
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
size_t size() const
Definition: SmallVector.h:79
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:574
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
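A minimal sketch of how a target wires these legalization hooks into its TargetLowering constructor (ExampleTLI and the chosen actions are hypothetical, not this file's configuration):
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Hypothetical target lowering: declare a few actions for the legalizer.
struct ExampleTLI : TargetLowering {
  explicit ExampleTLI(const TargetMachine &TM) : TargetLowering(TM) {
    setOperationAction(ISD::SDIV, MVT::i64, Expand);           // expand 64-bit signed division
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);           // no truncating i64 -> i16 stores
    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Legal); // sign-extending i8 loads are fine
  }
};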
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:83
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVM Value Representation.
Definition: Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:801
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:774
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:765
@ ConstantFP
Definition: ISDOpcodes.h:87
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1351
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1141
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:511
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:275
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:985
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:975
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:249
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:1018
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:826
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:773
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:1002
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1162
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1166
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:535
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:778
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1347
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:601
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1075
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:793
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
Definition: ISDOpcodes.h:1059
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1151
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:718
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:960
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1207
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1081
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:730
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1318
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1025
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:941
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:979
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1204
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:838
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:521
@ AssertZext
Definition: ISDOpcodes.h:63
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1086
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:543
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1691
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1671
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
MaybeAlign getAlign(const CallInst &I, unsigned Index)
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:390
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
Definition: bit.h:203
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:159
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:164
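A small sketch (with a made-up immediate) of splitting a 64-bit value with these helpers:
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

// Sketch: split a 64-bit immediate into the two 32-bit halves used when a
// 64-bit operation is expanded into 32-bit pieces.
constexpr uint64_t Imm = 0x123456789ABCDEF0ULL;
constexpr uint32_t LoHalf = Lo_32(Imm);  // 0x9ABCDEF0
constexpr uint32_t HiHalf = Hi_32(Imm);  // 0x12345678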
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ BeforeLegalizeTypes
Definition: DAGCombine.h:16
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:119
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
@ DS_Warning
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
static cl::opt< int > CostThreshold("sbvec-cost-threshold", cl::init(0), cl::Hidden, cl::desc("Vectorization cost threshold."))
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1569
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
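A brief sketch (made-up sizes) of these alignment helpers:
#include "llvm/Support/Alignment.h"
#include <cstdint>
using namespace llvm;

// Sketch: pad a 10-byte object to an 8-byte boundary and derive a common alignment.
static uint64_t alignmentDemo() {
  Align A(8);
  uint64_t Padded = alignTo(10, A);             // 16
  Align Common = commonAlignment(Align(16), 4); // Align(4): the offset limits the alignment
  (void)Common;
  return Padded + Log2(A);                      // 16 + 3
}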
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition: Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
static LLVM_ABI const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:266
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:267
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:390
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:472
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:458
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:368
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:238
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:380
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:425
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:465
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:407
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
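A minimal sketch (evtDemo is hypothetical) of building and querying an EVT with these factories:
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// Sketch: build v4i16 from scratch and query a few of its properties.
static void evtDemo(LLVMContext &Ctx) {
  EVT EltVT = EVT::getIntegerVT(Ctx, 16);
  EVT VecVT = EVT::getVectorVT(Ctx, EltVT, 4);   // v4i16
  unsigned Elts = VecVT.getVectorNumElements();  // 4
  bool Pow2 = VecVT.isPow2VectorType();          // true
  uint64_t Bits = VecVT.getFixedSizeInBits();    // 64
  (void)Elts; (void)Pow2; (void)Bits;
}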
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:376
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:414
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:318
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:287
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:323
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:330
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:331
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:303
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:101
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:235
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:154
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:289
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:241
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:138
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition: KnownBits.h:122
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:107
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:98
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:262
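A minimal sketch (the bit pattern is made up) of reasoning with a KnownBits value such as the one returned by computeKnownBits:
#include "llvm/Support/KnownBits.h"
using namespace llvm;

// Sketch: an 8-bit value whose low four bits are known to be zero.
static unsigned knownBitsDemo() {
  KnownBits Known(8);
  Known.Zero.setLowBits(4);             // bits [3:0] known zero
  APInt Max = Known.getMaxValue();      // 0b11110000 = 240
  bool NonNeg = Known.isNonNegative();  // false: sign bit still unknown
  (void)Max; (void)NonNeg;
  return Known.countMinTrailingZeros(); // 4
}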
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...