//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));
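
// Usage note: like any cl::opt, this flag can be flipped on the command line,
// e.g. `-amdgpu-bypass-slow-div=0` keeps the full 64-bit division expansion
// even for dynamically narrow values.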

// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
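
// For example, getEquivalentMemType maps a v4i16 value (64-bit store size) to
// v2i32, while an i16 (store size <= 32) stays a plain integer type.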

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must
  // be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}
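
// numBitsUnsigned/numBitsSigned feed the 24-bit arithmetic combines: a 32-bit
// value with at least nine known sign (or leading zero) bits has at most 24
// significant bits and fits the hardware's 24-bit multiply path.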

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);
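
  // Sketch of the effect: an i64 extending load from an i8 location becomes a
  // 32-bit extload of the i8 followed by a 64-bit extension of the result,
  // since no single 64-bit extload instruction exists.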

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP10},
                     MVT::f32, Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10},
                     MVT::f16, Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = {MVT::i32, MVT::i64};
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);
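
  // What Expand buys here: generic legalization rewrites the unsupported
  // direction in terms of the supported one, e.g. (rotl x, n) becomes
  // (rotr x, bitwidth - n) modulo the bit width, and FSHL is likewise
  // re-expressed via FSHR.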

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction(
      {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
      MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
                        ISD::MULHS,      ISD::OR,      ISD::SHL,
                        ISD::SRA,        ISD::SRL,     ISD::ROTL,
                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
                        ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
                        ISD::CTTZ,       ISD::CTLZ,    ISD::VECTOR_SHUFFLE,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction({ISD::FABS,          ISD::FMINNUM,        ISD::FMAXNUM,
                        ISD::FADD,          ISD::FCEIL,          ISD::FCOS,
                        ISD::FDIV,          ISD::FEXP2,          ISD::FEXP,
                        ISD::FEXP10,        ISD::FLOG2,          ISD::FREM,
                        ISD::FLOG,          ISD::FLOG10,         ISD::FPOW,
                        ISD::FFLOOR,        ISD::FTRUNC,         ISD::FMUL,
                        ISD::FMA,           ISD::FRINT,          ISD::FNEARBYINT,
                        ISD::FSQRT,         ISD::FSIN,           ISD::FSUB,
                        ISD::FNEG,          ISD::VSELECT,        ISD::SELECT_CC,
                        ISD::FCOPYSIGN,     ISD::VECTOR_SHUFFLE, ISD::SETCC,
                        ISD::FCANONICALIZE, ISD::FROUNDEVEN},
                       VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  // Disable most libcalls.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
    if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
      setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
  }
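
  // With the libcall names cleared, legalization can never emit a runtime
  // call such as __divdi3; everything outside the atomic libcall range must
  // be expanded or custom-lowered inline instead.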

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);
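  // Sketch of what the bypass emits: before each 64-bit division, a runtime
  // test of whether both operands actually fit in 32 bits, branching to a
  // cheap 32-bit divide when they do and to the full expansion otherwise.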

  setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
                       ISD::SRA,        ISD::SRL,
                       ISD::TRUNCATE,   ISD::MUL,
                       ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
                       ISD::MULHU,      ISD::MULHS,
                       ISD::SELECT,     ISD::SELECT_CC,
                       ISD::STORE,      ISD::FADD,
                       ISD::FSUB,       ISD::FNEG,
                       ISD::FABS,       ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}
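
// The payoff from fnegFoldsIntoOp: for the opcodes listed above, a combine
// can rewrite e.g. fneg (fadd x, y) as fadd (fneg x), (fneg y), letting the
// negations fold into VALU source modifiers instead of costing a separate
// XOR of the sign bit.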

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the type for ISD::SELECT.
LLVM_READNONE
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and each one would necessitate using VOP3, there will be
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}
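
// For example, getTypeForExtReturn widens an i48 return value to i64 (two
// 32-bit halves), while an i8 or i16 result is returned in an i32.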

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(
    EVT LoadTy, EVT CastTy, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && VT == MVT::f16);
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()
///
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 arguments.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}
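
// Worked example of the scheme described above: a kernel argument of type
// v8i8 that is split across registers keeps ArgVT == v8i8 in every Ins[x];
// the ArgVT.getVectorNumElements() == NumRegs branch then deduces an
// in-memory type of i8 for each part.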

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding to this call.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));

    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(NewVT, SL, Args);

    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

static SDValue peekFPSignOps(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FABS)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FCOPYSIGN)
    Val = Val.getOperand(0);
  return Val;
}

SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}
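
// For example, getSplitDestVTs splits v7i32 into (v4i32, v3i32) and v3f32
// into (v2f32, f32), handing the single leftover element back as a scalar.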

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
                        Store->getMemOperand()->getFlags());
  SDValue HiStore =
      DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
                        HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
1890
1891// This is a shortcut for integer division because we have fast i32<->f32
1892// conversions, and fast f32 reciprocal instructions. The fractional part of a
1893// float is enough to accurately represent up to a 24-bit signed integer.
1894SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1895 bool Sign) const {
1896 SDLoc DL(Op);
1897 EVT VT = Op.getValueType();
1898 SDValue LHS = Op.getOperand(0);
1899 SDValue RHS = Op.getOperand(1);
1900 MVT IntVT = MVT::i32;
1901 MVT FltVT = MVT::f32;
1902
1903 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1904 if (LHSSignBits < 9)
1905 return SDValue();
1906
1907 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1908 if (RHSSignBits < 9)
1909 return SDValue();
1910
1911 unsigned BitSize = VT.getSizeInBits();
1912 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1913 unsigned DivBits = BitSize - SignBits;
1914 if (Sign)
1915 ++DivBits;
1916
1917 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1918 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1919
1920 SDValue jq = DAG.getConstant(1, DL, IntVT);
1921
1922 if (Sign) {
1923 // char|short jq = ia ^ ib;
1924 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1925
1926 // jq = jq >> (bitsize - 2)
1927 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1928 DAG.getConstant(BitSize - 2, DL, VT));
1929
1930 // jq = jq | 0x1
1931 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1932 }
1933
1934 // int ia = (int)LHS;
1935 SDValue ia = LHS;
1936
1937 // int ib = (int)RHS;
1938 SDValue ib = RHS;
1939
1940 // float fa = (float)ia;
1941 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1942
1943 // float fb = (float)ib;
1944 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1945
1946 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1947 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1948
1949 // fq = trunc(fq);
1950 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1951
1952 // float fqneg = -fq;
1953 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1954
1955 MachineFunction &MF = DAG.getMachineFunction();
1956
1957 bool UseFmadFtz = false;
1958 if (Subtarget->isGCN()) {
1959 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1960 UseFmadFtz =
1961 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1962 }
1963
1964 // float fr = mad(fqneg, fb, fa);
1965 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1966 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1967 : (unsigned)ISD::FMAD;
1968 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1969
1970 // int iq = (int)fq;
1971 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1972
1973 // fr = fabs(fr);
1974 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1975
1976 // fb = fabs(fb);
1977 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1978
1979 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1980
1981 // int cv = fr >= fb;
1982 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1983
1984 // jq = (cv ? jq : 0);
1985 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1986
1987 // dst = iq + jq;
1988 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1989
1990 // Rem needs compensation; it's easier to recompute it.
1991 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1992 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1993
1994 // Truncate to number of bits this divide really is.
1995 if (Sign) {
1996 SDValue InRegSize
1997 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1998 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1999 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2000 } else {
2001 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2002 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2003 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2004 }
2005
2006 return DAG.getMergeValues({ Div, Rem }, DL);
2007}
2008
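// A minimal, self-contained C++ sketch of the shortcut above (hypothetical
// helper, not part of this lowering; assumes <cmath> and <cstdint>, with
// 1.0f / FB standing in for the hardware RCP). The reciprocal path can leave
// the quotient estimate one too small, which the fr >= |fb| check repairs;
// the unsigned case looks like:
static void udivrem24Sketch(uint32_t A, uint32_t B, uint32_t &Q, uint32_t &R) {
  float FA = (float)A, FB = (float)B;  // exact for values below 2^24
  float FQ = truncf(FA * (1.0f / FB)); // fq = trunc(fa * rcp(fb))
  float FR = fabsf(fmaf(-FQ, FB, FA)); // fr = |fa - fq * fb|
  Q = (uint32_t)FQ + (FR >= fabsf(FB) ? 1u : 0u);
  R = A - Q * B;
}
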
2009void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2010 SelectionDAG &DAG,
2011 SmallVectorImpl<SDValue> &Results) const {
2012 SDLoc DL(Op);
2013 EVT VT = Op.getValueType();
2014
2015 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2016
2017 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2018
2019 SDValue One = DAG.getConstant(1, DL, HalfVT);
2020 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2021
2022 // HiLo split
2023 SDValue LHS_Lo, LHS_Hi;
2024 SDValue LHS = Op.getOperand(0);
2025 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2026
2027 SDValue RHS_Lo, RHS_Hi;
2028 SDValue RHS = Op.getOperand(1);
2029 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2030
2031 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2032 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2033
2034 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2035 LHS_Lo, RHS_Lo);
2036
2037 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2038 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2039
2040 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2041 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2042 return;
2043 }
2044
2045 if (isTypeLegal(MVT::i64)) {
2046 // The algorithm here is based on ideas from "Software Integer Division",
2047 // Tom Rodeheffer, August 2008.
2048
2049 MachineFunction &MF = DAG.getMachineFunction();
2050 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2051
2052 // Compute denominator reciprocal.
2053 unsigned FMAD =
2054 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2055 : (MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
2056 ? (unsigned)ISD::FMAD
2057 : (unsigned)AMDGPUISD::FMAD_FTZ;
2058
2059 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2060 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2061 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2062 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2063 Cvt_Lo);
2064 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2065 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2066 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2067 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2068 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2069 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2070 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2071 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2072 Mul1);
2073 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2074 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2075 SDValue Rcp64 = DAG.getBitcast(VT,
2076 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2077
2078 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2079 SDValue One64 = DAG.getConstant(1, DL, VT);
2080 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2081 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2082
2083 // First round of UNR (Unsigned integer Newton-Raphson).
2084 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2085 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2086 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2087 SDValue Mulhi1_Lo, Mulhi1_Hi;
2088 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2089 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2090 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2091 Mulhi1_Lo, Zero1);
2092 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2093 Mulhi1_Hi, Add1_Lo.getValue(1));
2094 SDValue Add1 = DAG.getBitcast(VT,
2095 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2096
2097 // Second round of UNR.
2098 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2099 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2100 SDValue Mulhi2_Lo, Mulhi2_Hi;
2101 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2102 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2103 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2104 Mulhi2_Lo, Zero1);
2105 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2106 Mulhi2_Hi, Add2_Lo.getValue(1));
2107 SDValue Add2 = DAG.getBitcast(VT,
2108 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2109
2110 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2111
2112 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2113
2114 SDValue Mul3_Lo, Mul3_Hi;
2115 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2116 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2117 Mul3_Lo, Zero1);
2118 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2119 Mul3_Hi, Sub1_Lo.getValue(1));
2120 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2121 SDValue Sub1 = DAG.getBitcast(VT,
2122 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2123
2124 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2125 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2126 ISD::SETUGE);
2127 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2128 ISD::SETUGE);
2129 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2130
2131 // TODO: Here and below, portions of the code could be enclosed in if/endif.
2132 // Currently control flow is unconditional and we have 4 selects after the
2133 // potential endif to substitute PHIs.
2134
2135 // if C3 != 0 ...
2136 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2137 RHS_Lo, Zero1);
2138 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2139 RHS_Hi, Sub1_Lo.getValue(1));
2140 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2141 Zero, Sub2_Lo.getValue(1));
2142 SDValue Sub2 = DAG.getBitcast(VT,
2143 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2144
2145 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2146
2147 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2148 ISD::SETUGE);
2149 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2150 ISD::SETUGE);
2151 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2152
2153 // if (C6 != 0)
2154 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2155
2156 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2157 RHS_Lo, Zero1);
2158 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2159 RHS_Hi, Sub2_Lo.getValue(1));
2160 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2161 Zero, Sub3_Lo.getValue(1));
2162 SDValue Sub3 = DAG.getBitcast(VT,
2163 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2164
2165 // endif C6
2166 // endif C3
2167
2168 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2169 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2170
2171 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2172 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2173
2174 Results.push_back(Div);
2175 Results.push_back(Rem);
2176
2177 return;
2178 }
2179
2180 // r600 expandion.
2181 // Get Speculative values
2182 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2183 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2184
2185 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2186 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2187 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2188
2189 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2190 SDValue DIV_Lo = Zero;
2191
2192 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2193
2194 for (unsigned i = 0; i < halfBitWidth; ++i) {
2195 const unsigned bitPos = halfBitWidth - i - 1;
2196 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2197 // Get value of high bit
2198 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2199 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2200 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2201
2202 // Shift
2203 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2204 // Add LHS high bit
2205 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2206
2207 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2208 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2209
2210 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2211
2212 // Update REM
2213 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2214 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2215 }
2216
2217 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2218 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2219 Results.push_back(DIV);
2220 Results.push_back(REM);
2221}
2222
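// The bit-serial tail above is classic restoring division. A standalone C++
// sketch (hypothetical helper; the DAG version first handles the high half
// speculatively and only iterates over the low 32 bits):
static void restoringUDivRem64Sketch(uint64_t LHS, uint64_t RHS, uint64_t &Q,
                                     uint64_t &R) {
  Q = 0;
  R = 0;
  for (int Bit = 63; Bit >= 0; --Bit) {
    R = (R << 1) | ((LHS >> Bit) & 1); // shift the next numerator bit in
    if (R >= RHS) {                    // the SETUGE select above
      R -= RHS;
      Q |= 1ull << Bit;
    }
  }
}
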
2223SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2224 SelectionDAG &DAG) const {
2225 SDLoc DL(Op);
2226 EVT VT = Op.getValueType();
2227
2228 if (VT == MVT::i64) {
2229 SmallVector<SDValue, 2> Results;
2230 LowerUDIVREM64(Op, DAG, Results);
2231 return DAG.getMergeValues(Results, DL);
2232 }
2233
2234 if (VT == MVT::i32) {
2235 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2236 return Res;
2237 }
2238
2239 SDValue X = Op.getOperand(0);
2240 SDValue Y = Op.getOperand(1);
2241
2242 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2243 // algorithm used here.
2244
2245 // Initial estimate of inv(y).
2246 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2247
2248 // One round of UNR.
2249 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2250 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2251 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2252 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2253
2254 // Quotient/remainder estimate.
2255 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2256 SDValue R =
2257 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2258
2259 // First quotient/remainder refinement.
2260 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2261 SDValue One = DAG.getConstant(1, DL, VT);
2262 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2263 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2264 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2265 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2266 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2267
2268 // Second quotient/remainder refinement.
2269 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2270 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2271 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2272 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2273 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2274
2275 return DAG.getMergeValues({Q, R}, DL);
2276}
2277
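// A standalone C++ sketch of the expansion above (hypothetical helper;
// assumes <cstdint>, and the scaled 1.0f / (float)Y stands in for the
// AMDGPUISD::URECIP estimate, so the error bound here is illustrative rather
// than guaranteed):
static void udivrem32Sketch(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  // Initial estimate of inv(y), scaled by 2^32 - 512 (0x4F7FFFFE as f32).
  uint32_t Z = (uint32_t)(0x1.fffffcp+31f * (1.0f / (float)Y));
  // One round of UNR: z += mulhu(z, -y * z).
  Z += (uint32_t)(((uint64_t)Z * (uint32_t)(0u - Y * Z)) >> 32);
  Q = (uint32_t)(((uint64_t)X * Z) >> 32); // quotient estimate
  R = X - Q * Y;                           // remainder estimate
  for (int I = 0; I < 2; ++I) {            // the two refinement rounds above
    if (R >= Y) {
      ++Q;
      R -= Y;
    }
  }
}
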
2278SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2279 SelectionDAG &DAG) const {
2280 SDLoc DL(Op);
2281 EVT VT = Op.getValueType();
2282
2283 SDValue LHS = Op.getOperand(0);
2284 SDValue RHS = Op.getOperand(1);
2285
2286 SDValue Zero = DAG.getConstant(0, DL, VT);
2287 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2288
2289 if (VT == MVT::i32) {
2290 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2291 return Res;
2292 }
2293
2294 if (VT == MVT::i64 &&
2295 DAG.ComputeNumSignBits(LHS) > 32 &&
2296 DAG.ComputeNumSignBits(RHS) > 32) {
2297 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2298
2299 // HiLo split
2300 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2301 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2302 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2303 LHS_Lo, RHS_Lo);
2304 SDValue Res[2] = {
2305 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2306 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2307 };
2308 return DAG.getMergeValues(Res, DL);
2309 }
2310
2311 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2312 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2313 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2314 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2315
2316 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2317 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2318
2319 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2320 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2321
2322 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2323 SDValue Rem = Div.getValue(1);
2324
2325 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2326 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2327
2328 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2329 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2330
2331 SDValue Res[2] = {
2332 Div,
2333 Rem
2334 };
2335 return DAG.getMergeValues(Res, DL);
2336}
2337
2338// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2339SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2340 SDLoc SL(Op);
2341 EVT VT = Op.getValueType();
2342 auto Flags = Op->getFlags();
2343 SDValue X = Op.getOperand(0);
2344 SDValue Y = Op.getOperand(1);
2345
2346 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2347 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2348 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2349 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2350 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2351}
2352
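// The same decomposition as a plain C++ sketch (hypothetical helper; fmaf
// stands in for the ISD::FMA node). Unlike fmodf, the result inherits the
// rounding of the division for very large quotients:
static float fremSketch(float X, float Y) {
  float Q = truncf(X / Y); // (ftrunc (fdiv x, y))
  return fmaf(-Q, Y, X);   // (fma (fneg q), y, x)
}
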
2353SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2354 SDLoc SL(Op);
2355 SDValue Src = Op.getOperand(0);
2356
2357 // result = trunc(src)
2358 // if (src > 0.0 && src != result)
2359 // result += 1.0
2360
2361 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2362
2363 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2364 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2365
2366 EVT SetCCVT =
2367 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2368
2369 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2370 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2371 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2372
2373 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2374 // TODO: Should this propagate fast-math-flags?
2375 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2376}
2377
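// The same logic as a branchy C++ sketch (hypothetical helper; the DAG
// version stays branchless via setcc/select, and both compares are ordered,
// so NaN falls through to the bare trunc):
static double fceilSketch(double Src) {
  double Result = trunc(Src);
  if (Src > 0.0 && Src != Result) // SETOGT && SETONE
    Result += 1.0;
  return Result;
}
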
2378static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2379 SelectionDAG &DAG) {
2380 const unsigned FractBits = 52;
2381 const unsigned ExpBits = 11;
2382
2383 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2384 Hi,
2385 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2386 DAG.getConstant(ExpBits, SL, MVT::i32));
2387 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2388 DAG.getConstant(1023, SL, MVT::i32));
2389
2390 return Exp;
2391}
2392
2393SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2394 SDLoc SL(Op);
2395 SDValue Src = Op.getOperand(0);
2396
2397 assert(Op.getValueType() == MVT::f64);
2398
2399 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2400
2401 // Extract the upper half, since this is where we will find the sign and
2402 // exponent.
2403 SDValue Hi = getHiHalf64(Src, DAG);
2404
2405 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2406
2407 const unsigned FractBits = 52;
2408
2409 // Extract the sign bit.
2410 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2411 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2412
2413 // Extend back to 64-bits.
2414 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2415 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2416
2417 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2418 const SDValue FractMask
2419 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2420
2421 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2422 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2423 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2424
2425 EVT SetCCVT =
2426 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2427
2428 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2429
2430 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2431 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2432
2433 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2434 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2435
2436 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2437}
2438
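// A bit-level C++ sketch of the masking above (hypothetical helper; assumes
// <cstdint> and <cstring>): the unbiased exponent says how many of the 52
// fraction bits are integral, and everything below that is cleared.
static double ftruncSketch(double Src) {
  uint64_t Bits;
  std::memcpy(&Bits, &Src, 8);
  int Exp = (int)((Bits >> 52) & 0x7ff) - 1023;
  if (Exp < 0)
    Bits &= 0x8000000000000000ull;       // |x| < 1: keep only the sign bit
  else if (Exp < 52)
    Bits &= ~((1ull << (52 - Exp)) - 1); // clear sub-integer fraction bits
  // Exp >= 52: already integral (or inf/nan); leave the bits unchanged.
  double Out;
  std::memcpy(&Out, &Bits, 8);
  return Out;
}
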
2439SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2440 SelectionDAG &DAG) const {
2441 SDLoc SL(Op);
2442 SDValue Src = Op.getOperand(0);
2443
2444 assert(Op.getValueType() == MVT::f64);
2445
2446 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2447 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2448 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2449
2450 // TODO: Should this propagate fast-math-flags?
2451
2452 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2453 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2454
2455 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2456
2457 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2458 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2459
2460 EVT SetCCVT =
2461 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2462 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2463
2464 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2465}
2466
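// The magic-number trick above as a C++ sketch (hypothetical helper;
// assumes strict IEEE evaluation with no reassociation or contraction):
// adding and subtracting 2^52 with the input's sign forces the FPU to round
// to an integer in round-to-nearest-even mode, and inputs already integral
// in magnitude bypass the add.
static double froundevenSketch(double Src) {
  double C = copysign(0x1.0p+52, Src);
  double Rounded = (Src + C) - C;
  return fabs(Src) > 0x1.fffffffffffffp+51 ? Src : Rounded;
}
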
2467SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2468 SelectionDAG &DAG) const {
2469 // FNEARBYINT and FRINT are the same, except in their handling of FP
2470 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2471 // rint, so just treat them as equivalent.
2472 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2473 Op.getOperand(0));
2474}
2475
2476SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2477 auto VT = Op.getValueType();
2478 auto Arg = Op.getOperand(0u);
2479 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2480}
2481
2482// XXX - May require not supporting f32 denormals?
2483
2484// Don't handle v2f16. The extra instructions to scalarize and repack around the
2485// compare and vselect end up producing worse code than scalarizing the whole
2486// operation.
2487SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2488 SDLoc SL(Op);
2489 SDValue X = Op.getOperand(0);
2490 EVT VT = Op.getValueType();
2491
2492 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2493
2494 // TODO: Should this propagate fast-math-flags?
2495
2496 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2497
2498 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2499
2500 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2501 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2502
2503 EVT SetCCVT =
2504 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2505
2506 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2507 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2508 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2509
2510 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2511 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2512}
2513
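// The expansion above as a C++ sketch (hypothetical helper): keep the
// truncation and add a signed unit when the discarded fraction is at least
// one half, i.e. roundf's round-half-away-from-zero semantics.
static float froundSketch(float X) {
  float T = truncf(X);
  float Offset = copysignf(fabsf(X - T) >= 0.5f ? 1.0f : 0.0f, X);
  return T + Offset;
}
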
2514SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2515 SDLoc SL(Op);
2516 SDValue Src = Op.getOperand(0);
2517
2518 // result = trunc(src);
2519 // if (src < 0.0 && src != result)
2520 // result += -1.0.
2521
2522 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2523
2524 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2525 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2526
2527 EVT SetCCVT =
2528 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2529
2530 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2531 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2532 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2533
2534 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2535 // TODO: Should this propagate fast-math-flags?
2536 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2537}
2538
2539/// Return true if it's known that \p Src can never be an f32 denormal value.
2540static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2541 switch (Src.getOpcode()) {
2542 case ISD::FP_EXTEND:
2543 return Src.getOperand(0).getValueType() == MVT::f16;
2544 case ISD::FP16_TO_FP:
2545 case ISD::FFREXP:
2546 return true;
2547 case ISD::INTRINSIC_WO_CHAIN: {
2548 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2549 switch (IntrinsicID) {
2550 case Intrinsic::amdgcn_frexp_mant:
2551 return true;
2552 default:
2553 return false;
2554 }
2555 }
2556 default:
2557 return false;
2558 }
2559
2560 llvm_unreachable("covered opcode switch");
2561}
2562
2563static bool allowApproxFunc(const SelectionDAG &DAG,
2564 SDNodeFlags Flags) {
2565 if (Flags.hasApproximateFuncs())
2566 return true;
2567 auto &Options = DAG.getTarget().Options;
2568 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2569}
2570
2571static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2572 SDValue Src,
2573 SDNodeFlags Flags) {
2574 return !valueIsKnownNeverF32Denorm(Src) &&
2575 DAG.getMachineFunction()
2576 .getDenormalMode(APFloat::IEEEsingle())
2577 .Input != DenormalMode::IEEE;
2578}
2579
2580SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2581 SDValue Src,
2582 SDNodeFlags Flags) const {
2583 SDLoc SL(Src);
2584 EVT VT = Src.getValueType();
2585 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2586 SDValue SmallestNormal =
2587 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2588
2589 // Want to scale denormals up, but negatives and 0 work just as well on the
2590 // scaled path.
2591 SDValue IsLtSmallestNormal = DAG.getSetCC(
2592 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2593 SmallestNormal, ISD::SETOLT);
2594
2595 return IsLtSmallestNormal;
2596}
2597
2598SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2599 SDNodeFlags Flags) const {
2600 SDLoc SL(Src);
2601 EVT VT = Src.getValueType();
2602 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2603 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2604
2605 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2606 SDValue IsFinite = DAG.getSetCC(
2607 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2608 Inf, ISD::SETOLT);
2609 return IsFinite;
2610}
2611
2612/// If denormal handling is required return the scaled input to FLOG2, and the
2613/// check for denormal range. Otherwise, return null values.
2614std::pair<SDValue, SDValue>
2616 SDValue Src, SDNodeFlags Flags) const {
2617 if (!needsDenormHandlingF32(DAG, Src, Flags))
2618 return {};
2619
2620 MVT VT = MVT::f32;
2621 const fltSemantics &Semantics = APFloat::IEEEsingle();
2622 SDValue SmallestNormal =
2623 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2624
2625 SDValue IsLtSmallestNormal = DAG.getSetCC(
2626 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2627 SmallestNormal, ISD::SETOLT);
2628
2629 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2630 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2631 SDValue ScaleFactor =
2632 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2633
2634 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2635 return {ScaledInput, IsLtSmallestNormal};
2636}
2637
2638SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2639 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2640 // If we have to handle denormals, scale up the input and adjust the result.
2641
2642 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2643 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2644
2645 SDLoc SL(Op);
2646 EVT VT = Op.getValueType();
2647 SDValue Src = Op.getOperand(0);
2648 SDNodeFlags Flags = Op->getFlags();
2649
2650 if (VT == MVT::f16) {
2651 // Nothing in half is a denormal when promoted to f32.
2652 assert(!Subtarget->has16BitInsts());
2653 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2654 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2655 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2656 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2657 }
2658
2659 auto [ScaledInput, IsLtSmallestNormal] =
2660 getScaledLogInput(DAG, SL, Src, Flags);
2661 if (!ScaledInput)
2662 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2663
2664 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2665
2666 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2667 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2668 SDValue ResultOffset =
2669 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2670 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2671}
2672
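// Numerically, the scaling above amounts to the following C++ sketch
// (hypothetical helper; log2f stands in for v_log_f32): a denormal input is
// multiplied into the normal range, and the exact log2(2^32) = 32.0 is
// subtracted from the result.
static float log2DenormSketch(float Src) {
  bool IsDenorm = Src < 0x1.0p-126f; // below the smallest normal
  float Scaled = Src * (IsDenorm ? 0x1.0p+32f : 1.0f);
  return log2f(Scaled) - (IsDenorm ? 32.0f : 0.0f);
}
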
2673static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2674 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2675 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2676 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2677}
2678
2679SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2680 SelectionDAG &DAG) const {
2681 SDValue X = Op.getOperand(0);
2682 EVT VT = Op.getValueType();
2683 SDNodeFlags Flags = Op->getFlags();
2684 SDLoc DL(Op);
2685
2686 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2687 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2688
2689 const auto &Options = getTargetMachine().Options;
2690 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2691 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2692
2693 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2694 // Log and multiply in f32 is good enough for f16.
2695 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2696 }
2697
2698 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2699 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2700 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2701 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2702 }
2703
2704 return Lowered;
2705 }
2706
2707 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2708 if (ScaledInput)
2709 X = ScaledInput;
2710
2711 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2712
2713 SDValue R;
2714 if (Subtarget->hasFastFMAF32()) {
2715 // c+cc are ln(2)/ln(10) to more than 49 bits
2716 const float c_log10 = 0x1.344134p-2f;
2717 const float cc_log10 = 0x1.09f79ep-26f;
2718
2719 // c + cc is ln(2) to more than 49 bits
2720 const float c_log = 0x1.62e42ep-1f;
2721 const float cc_log = 0x1.efa39ep-25f;
2722
2723 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2724 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2725
2726 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2727 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2728 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2729 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2730 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2731 } else {
2732 // ch+ct is ln(2)/ln(10) to more than 36 bits
2733 const float ch_log10 = 0x1.344000p-2f;
2734 const float ct_log10 = 0x1.3509f6p-18f;
2735
2736 // ch + ct is ln(2) to more than 36 bits
2737 const float ch_log = 0x1.62e000p-1f;
2738 const float ct_log = 0x1.0bfbe8p-15f;
2739
2740 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2741 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2742
2743 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2744 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2745 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2746 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2747 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2748
2749 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2750 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2751 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2752 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2753 }
2754
2755 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2756 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2757
2758 // TODO: Check if known finite from source value.
2759 if (!IsFiniteOnly) {
2760 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2761 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2762 }
2763
2764 if (IsScaled) {
2765 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2766 SDValue ShiftK =
2767 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2768 SDValue Shift =
2769 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2770 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2771 }
2772
2773 return R;
2774}
2775
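// The FMA path above is a double-float multiply. As a C++ sketch
// (hypothetical helper): splitting ln(2) into a head C and tail CC, the FMA
// recovers the low-order product bits a single f32 multiply would discard.
static float lnFromLog2Sketch(float Y /* = log2(x) */) {
  const float C = 0x1.62e42ep-1f;   // head of ln(2)
  const float CC = 0x1.efa39ep-25f; // tail; C + CC is ln(2) to 49 bits
  float R = Y * C;
  float Err = fmaf(Y, C, -R); // exact error of the head multiply
  return R + fmaf(Y, CC, Err); // fold in the tail and the error
}
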
2776SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2777 return LowerFLOGCommon(Op, DAG);
2778}
2779
2780// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2781// promoted f16 operation.
2782SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2783 SelectionDAG &DAG, bool IsLog10,
2784 SDNodeFlags Flags) const {
2785 EVT VT = Src.getValueType();
2786 unsigned LogOp =
2787 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2788
2789 double Log2BaseInverted =
2790 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2791
2792 if (VT == MVT::f32) {
2793 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2794 if (ScaledInput) {
2795 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2796 SDValue ScaledResultOffset =
2797 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2798
2799 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2800
2801 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2802 ScaledResultOffset, Zero, Flags);
2803
2804 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2805
2806 if (Subtarget->hasFastFMAF32())
2807 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2808 Flags);
2809 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2810 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2811 }
2812 }
2813
2814 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2815 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2816
2817 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2818 Flags);
2819}
2820
2821SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2822 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2823 // If we have to handle denormals, scale up the input and adjust the result.
2824
2825 SDLoc SL(Op);
2826 EVT VT = Op.getValueType();
2827 SDValue Src = Op.getOperand(0);
2828 SDNodeFlags Flags = Op->getFlags();
2829
2830 if (VT == MVT::f16) {
2831 // Nothing in half is a denormal when promoted to f32.
2832 assert(!Subtarget->has16BitInsts());
2833 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2834 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2835 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2836 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2837 }
2838
2839 assert(VT == MVT::f32);
2840
2841 if (!needsDenormHandlingF32(DAG, Src, Flags))
2842 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2843
2844 // bool needs_scaling = x < -0x1.f80000p+6f;
2845 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2846
2847 // -nextafter(128.0, -1)
2848 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2849
2850 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2851
2852 SDValue NeedsScaling =
2853 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2854
2855 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2856 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2857
2858 SDValue AddOffset =
2859 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2860
2861 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2862 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2863
2864 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2865 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2866 SDValue ResultScale =
2867 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2868
2869 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2870}
2871
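// Numeric C++ sketch of the scaling above (hypothetical helper; exp2f
// stands in for v_exp_f32): inputs below about -126 would produce a
// denormal result, so shift the input up by 64 and scale the result back
// down by 2^-64.
static float exp2DenormSketch(float X) {
  bool Scale = X < -0x1.f80000p+6f; // -nextafter(128.0, -1)
  float E = exp2f(X + (Scale ? 0x1.0p+6f : 0.0f));
  return E * (Scale ? 0x1.0p-64f : 1.0f);
}
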
2872SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2873 SelectionDAG &DAG,
2874 SDNodeFlags Flags) const {
2875 EVT VT = X.getValueType();
2876 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2877
2878 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2879 // exp2(M_LOG2E_F * f);
2880 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2881 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2882 : (unsigned)ISD::FEXP2,
2883 SL, VT, Mul, Flags);
2884 }
2885
2886 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2887
2888 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2889 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2890
2891 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2892
2893 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2894
2895 SDValue AdjustedX =
2896 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2897
2898 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2899
2900 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2901
2902 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2903 SDValue AdjustedResult =
2904 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2905
2906 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2907 Flags);
2908}
2909
2910/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2911/// handled correctly.
2912SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2913 SelectionDAG &DAG,
2914 SDNodeFlags Flags) const {
2915 const EVT VT = X.getValueType();
2916 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2917
2918 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2919 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2920 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2921 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2922
2923 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2924 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2925 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2926 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2927 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2928 }
2929
2930 // bool s = x < -0x1.2f7030p+5f;
2931 // x += s ? 0x1.0p+5f : 0.0f;
2932 // exp10 = exp2(x * 0x1.a92000p+1f) *
2933 // exp2(x * 0x1.4f0978p-11f) *
2934 // (s ? 0x1.9f623ep-107f : 1.0f);
2935
2936 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2937
2938 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2939 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2940
2941 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2942 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2943 SDValue AdjustedX =
2944 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2945
2946 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2947 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2948
2949 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2950 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2951 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2952 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2953
2954 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2955
2956 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2957 SDValue AdjustedResult =
2958 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2959
2960 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2961 Flags);
2962}
2963
2964SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2965 EVT VT = Op.getValueType();
2966 SDLoc SL(Op);
2967 SDValue X = Op.getOperand(0);
2968 SDNodeFlags Flags = Op->getFlags();
2969 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2970
2971 if (VT.getScalarType() == MVT::f16) {
2972 // v_exp_f16 (fmul x, log2e)
2973 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2974 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2975
2976 if (VT.isVector())
2977 return SDValue();
2978
2979 // exp(f16 x) ->
2980 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2981
2982 // Nothing in half is a denormal when promoted to f32.
2983 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2984 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2985 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2986 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2987 }
2988
2989 assert(VT == MVT::f32);
2990
2991 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2992 // library behavior. Also, is known-not-daz source sufficient?
2993 if (allowApproxFunc(DAG, Flags)) {
2994 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2995 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2996 }
2997
2998 // Algorithm:
2999 //
3000 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3001 //
3002 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3003 // n = 64*m + j, 0 <= j < 64
3004 //
3005 // e^x = 2^((64*m + j + f)/64)
3006 // = (2^m) * (2^(j/64)) * 2^(f/64)
3007 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3008 //
3009 // f = x*(64/ln(2)) - n
3010 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3011 //
3012 // e^x = (2^m) * (2^(j/64)) * e^r
3013 //
3014 // (2^(j/64)) is precomputed
3015 //
3016 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3017 // e^r = 1 + q
3018 //
3019 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3020 //
3021 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3022 SDNodeFlags FlagsNoContract = Flags;
3023 FlagsNoContract.setAllowContract(false);
3024
3025 SDValue PH, PL;
3026 if (Subtarget->hasFastFMAF32()) {
3027 const float c_exp = numbers::log2ef;
3028 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3029 const float c_exp10 = 0x1.a934f0p+1f;
3030 const float cc_exp10 = 0x1.2f346ep-24f;
3031
3032 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3033 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3034
3035 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3036 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3037 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3038 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3039 } else {
3040 const float ch_exp = 0x1.714000p+0f;
3041 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3042
3043 const float ch_exp10 = 0x1.a92000p+1f;
3044 const float cl_exp10 = 0x1.4f0978p-11f;
3045
3046 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3047 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3048
3049 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3050 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3051 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3052 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3053 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3054
3055 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3056
3057 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3058 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3059 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3060 }
3061
3062 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3063
3064 // It is unsafe to contract this fsub into the PH multiply.
3065 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3066
3067 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3068 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3069 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3070
3071 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3072
3073 SDValue UnderflowCheckConst =
3074 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3075
3076 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3077 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3078 SDValue Underflow =
3079 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3080
3081 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3082 const auto &Options = getTargetMachine().Options;
3083
3084 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3085 SDValue OverflowCheckConst =
3086 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3087 SDValue Overflow =
3088 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3089 SDValue Inf =
3090 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3091 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3092 }
3093
3094 return R;
3095}
3096
3097static bool isCtlzOpc(unsigned Opc) {
3098 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3099}
3100
3101static bool isCttzOpc(unsigned Opc) {
3102 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3103}
3104
3105SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3106 SelectionDAG &DAG) const {
3107 auto SL = SDLoc(Op);
3108 auto Arg = Op.getOperand(0u);
3109 auto ResultVT = Op.getValueType();
3110
3111 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3112 return {};
3113
3114 assert(isCtlzOpc(Op.getOpcode()));
3115 assert(ResultVT == Arg.getValueType());
3116
3117 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3118 auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3119 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3120 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3121 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
3122 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3123}
3124
3125SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3126 SDLoc SL(Op);
3127 SDValue Src = Op.getOperand(0);
3128
3129 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3130 bool Ctlz = isCtlzOpc(Op.getOpcode());
3131 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3132
3133 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3134 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3135 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3136
3137 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3138 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3139 // (cttz hi:lo) -> (umin (ffbl src), 32)
3140 // (ctlz_zero_undef src) -> (ffbh src)
3141 // (cttz_zero_undef src) -> (ffbl src)
3142
3143 // The 64-bit scalar version produces a 32-bit result.
3144 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3145 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3146 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3147 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3148 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3149 if (!ZeroUndef) {
3150 const SDValue ConstVal = DAG.getConstant(
3151 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3152 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3153 }
3154 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3155 }
3156
3157 SDValue Lo, Hi;
3158 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3159
3160 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3161 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3162
3163 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3164 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3165 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3166 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3167
3168 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3169 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3170 if (Ctlz)
3171 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3172 else
3173 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3174
3175 SDValue NewOpr;
3176 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3177 if (!ZeroUndef) {
3178 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3179 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3180 }
3181
3182 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3183}
3184
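// The split expansion above as a C++ sketch for the ctlz case (hypothetical
// helper; assumes <algorithm> and <cstdint>, with __builtin_clz modeling
// ffbh's nonzero case while a zero input yields ~0u, matching the uaddsat
// and umin clamps):
static uint32_t ffbhU32Sketch(uint32_t V) {
  return V ? (uint32_t)__builtin_clz(V) : 0xFFFFFFFFu;
}

static uint32_t ctlz64Sketch(uint64_t Src) {
  uint32_t Lo = (uint32_t)Src, Hi = (uint32_t)(Src >> 32);
  uint64_t Sat = (uint64_t)ffbhU32Sketch(Lo) + 32; // (uaddsat (ffbh lo), 32)
  uint32_t Add = Sat > 0xFFFFFFFFull ? 0xFFFFFFFFu : (uint32_t)Sat;
  return std::min(std::min(ffbhU32Sketch(Hi), Add), 64u); // umin3(..., 64)
}
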
3185SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3186 bool Signed) const {
3187 // The regular method of converting a 64-bit integer to float roughly consists of
3188 // 2 steps: normalization and rounding. In fact, after normalization, the
3189 // conversion from a 64-bit integer to a float is essentially the same as the
3190 // one from a 32-bit integer. The only difference is that it has more
3191 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3192 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3193 // converted into the correct float number. The basic steps for the unsigned
3194 // conversion are illustrated in the following pseudo code:
3195 //
3196 // f32 uitofp(i64 u) {
3197 // i32 hi, lo = split(u);
3198 // // Only count the leading zeros in hi as we have native support of the
3199 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3200 // // reduced to a 32-bit one automatically.
3201 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3202 // u <<= shamt;
3203 // hi, lo = split(u);
3204 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3205 // // convert it as a 32-bit integer and scale the result back.
3206 // return uitofp(hi) * 2^(32 - shamt);
3207 // }
3208 //
3209 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3210 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3211 // converted instead, followed by negation based on its sign bit.
3212
3213 SDLoc SL(Op);
3214 SDValue Src = Op.getOperand(0);
3215
3216 SDValue Lo, Hi;
3217 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3218 SDValue Sign;
3219 SDValue ShAmt;
3220 if (Signed && Subtarget->isGCN()) {
3221 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3222 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3223 // account. That is, the maximal shift is
3224 // - 32 if Lo and Hi have opposite signs;
3225 // - 33 if Lo and Hi have the same sign.
3226 //
3227 // Or, MaxShAmt = 33 + OppositeSign, where
3228 //
3229 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3230 // - -1 if Lo and Hi have opposite signs; and
3231 // - 0 otherwise.
3232 //
3233 // All in all, ShAmt is calculated as
3234 //
3235 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3236 //
3237 // or
3238 //
3239 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3240 //
3241 // to reduce the critical path.
3242 SDValue OppositeSign = DAG.getNode(
3243 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3244 DAG.getConstant(31, SL, MVT::i32));
3245 SDValue MaxShAmt =
3246 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3247 OppositeSign);
3248 // Count the leading sign bits.
3249 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3250 // Different from unsigned conversion, the shift should be one bit less to
3251 // preserve the sign bit.
3252 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3253 DAG.getConstant(1, SL, MVT::i32));
3254 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3255 } else {
3256 if (Signed) {
3257 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3258 // absolute value first.
3259 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3260 DAG.getConstant(63, SL, MVT::i64));
3261 SDValue Abs =
3262 DAG.getNode(ISD::XOR, SL, MVT::i64,
3263 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3264 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3265 }
3266 // Count the leading zeros.
3267 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3268 // The shift amount for signed integers is [0, 32].
3269 }
3270 // Normalize the given 64-bit integer.
3271 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3272 // Split it again.
3273 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3274 // Calculate the adjust bit for rounding.
3275 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3276 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3277 DAG.getConstant(1, SL, MVT::i32), Lo);
3278 // Get the 32-bit normalized integer.
3279 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3280 // Convert the normalized 32-bit integer into f32.
3281 unsigned Opc =
3282 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3283 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3284
3285 // Finally, need to scale back the converted floating number as the original
3286 // 64-bit integer is converted as a 32-bit one.
3287 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3288 ShAmt);
3289 // On GCN, use LDEXP directly.
3290 if (Subtarget->isGCN())
3291 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3292
3293 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3294 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3295 // exponent is enough to avoid overflowing into the sign bit.
3296 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3297 DAG.getConstant(23, SL, MVT::i32));
3298 SDValue IVal =
3299 DAG.getNode(ISD::ADD, SL, MVT::i32,
3300 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3301 if (Signed) {
3302 // Set the sign bit.
3303 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3304 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3305 DAG.getConstant(31, SL, MVT::i32));
3306 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3307 }
3308 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3309}
3310
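// A runnable C++ version of the pseudo code above for the unsigned case
// (hypothetical helper; assumes <cmath> and <cstdint>, with __builtin_clz
// for the clz step):
static float uitofp64Sketch(uint64_t U) {
  uint32_t Hi = (uint32_t)(U >> 32);
  int Shamt = Hi ? __builtin_clz(Hi) : 32; // return 32 if hi is all 0s
  U <<= Shamt;
  Hi = (uint32_t)(U >> 32);
  uint32_t Lo = (uint32_t)U;
  Hi |= Lo ? 1u : 0u;                   // sticky bit adjusts the rounding
  return ldexpf((float)Hi, 32 - Shamt); // scale the 32-bit conversion back
}
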
3311SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3312 bool Signed) const {
3313 SDLoc SL(Op);
3314 SDValue Src = Op.getOperand(0);
3315
3316 SDValue Lo, Hi;
3317 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3318
3320 SL, MVT::f64, Hi);
3321
3322 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3323
3324 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3325 DAG.getConstant(32, SL, MVT::i32));
3326 // TODO: Should this propagate fast-math-flags?
3327 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3328}
3329
3330SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3331 SelectionDAG &DAG) const {
3332 // TODO: Factor out code common with LowerSINT_TO_FP.
3333 EVT DestVT = Op.getValueType();
3334 SDValue Src = Op.getOperand(0);
3335 EVT SrcVT = Src.getValueType();
3336
3337 if (SrcVT == MVT::i16) {
3338 if (DestVT == MVT::f16)
3339 return Op;
3340 SDLoc DL(Op);
3341
3342 // Promote src to i32
3343 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3344 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3345 }
3346
3347 if (DestVT == MVT::bf16) {
3348 SDLoc SL(Op);
3349 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3350 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3351 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3352 }
3353
3354 if (SrcVT != MVT::i64)
3355 return Op;
3356
3357 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3358 SDLoc DL(Op);
3359
3360 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3361 SDValue FPRoundFlag =
3362 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3363 SDValue FPRound =
3364 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3365
3366 return FPRound;
3367 }
3368
3369 if (DestVT == MVT::f32)
3370 return LowerINT_TO_FP32(Op, DAG, false);
3371
3372 assert(DestVT == MVT::f64);
3373 return LowerINT_TO_FP64(Op, DAG, false);
3374}
3375
3376SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3377 SelectionDAG &DAG) const {
3378 EVT DestVT = Op.getValueType();
3379
3380 SDValue Src = Op.getOperand(0);
3381 EVT SrcVT = Src.getValueType();
3382
3383 if (SrcVT == MVT::i16) {
3384 if (DestVT == MVT::f16)
3385 return Op;
3386
3387 SDLoc DL(Op);
3388 // Promote src to i32
3389 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3390 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3391 }
3392
3393 if (DestVT == MVT::bf16) {
3394 SDLoc SL(Op);
3395 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3396 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3397 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3398 }
3399
3400 if (SrcVT != MVT::i64)
3401 return Op;
3402
3403 // TODO: Factor out code common with LowerUINT_TO_FP.
3404
3405 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3406 SDLoc DL(Op);
3407 SDValue Src = Op.getOperand(0);
3408
3409 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3410 SDValue FPRoundFlag =
3411 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3412 SDValue FPRound =
3413 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3414
3415 return FPRound;
3416 }
3417
3418 if (DestVT == MVT::f32)
3419 return LowerINT_TO_FP32(Op, DAG, true);
3420
3421 assert(DestVT == MVT::f64);
3422 return LowerINT_TO_FP64(Op, DAG, true);
3423}
3424
3425SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3426 bool Signed) const {
3427 SDLoc SL(Op);
3428
3429 SDValue Src = Op.getOperand(0);
3430 EVT SrcVT = Src.getValueType();
3431
3432 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3433
3434 // The basic idea of converting a floating point number into a pair of 32-bit
3435 // integers is illustrated as follows:
3436 //
3437 // tf := trunc(val);
3438 // hif := floor(tf * 2^-32);
3439 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3440 // hi := fptoi(hif);
3441 // lo := fptoi(lof);
3442 //
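// A concrete run of the recipe above (illustrative, not from the original
// file): for val = 2^33 + 7.0 as an f64,
//   tf  = 2^33 + 7.0
//   hif = floor(tf * 2^-32) = 2.0
//   lof = tf - 2.0 * 2^32  = 7.0
// so hi = 2 and lo = 7, reassembling to the i64 value (2 << 32) | 7.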
3443 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3444 SDValue Sign;
3445 if (Signed && SrcVT == MVT::f32) {
3446 // However, a 32-bit floating point number has only a 23-bit mantissa,
3447 // which is not enough to hold all the significant bits of `lof` if val
3448 // is negative. To avoid the loss of precision, we need to take the
3449 // absolute value after truncating and flip the result back based on the
3450 // original signedness.
3451 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3452 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3453 DAG.getConstant(31, SL, MVT::i32));
3454 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3455 }
3456
3457 SDValue K0, K1;
3458 if (SrcVT == MVT::f64) {
3459 K0 = DAG.getConstantFP(
3460 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3461 SrcVT);
3462 K1 = DAG.getConstantFP(
3463 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3464 SrcVT);
3465 } else {
3466 K0 = DAG.getConstantFP(
3467 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3468 K1 = DAG.getConstantFP(
3469 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3470 }
3471 // TODO: Should this propagate fast-math-flags?
3472 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3473
3474 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3475
3476 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3477
3478 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3479 : ISD::FP_TO_UINT,
3480 SL, MVT::i32, FloorMul);
3481 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3482
3483 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3484 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3485
3486 if (Signed && SrcVT == MVT::f32) {
3487 assert(Sign);
3488 // Flip the result based on the signedness, which is either all 0s or 1s.
3489 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3490 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3491 // r := xor(r, sign) - sign;
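// (When Sign is all ones, the xor gives ~r and ~r - (-1) == ~r + 1 == -r,
// the two's complement negate; when Sign is zero the value is unchanged.)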
3492 Result =
3493 DAG.getNode(ISD::SUB, SL, MVT::i64,
3494 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3495 }
3496
3497 return Result;
3498}
3499
3500SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3501 SDLoc DL(Op);
3502 SDValue N0 = Op.getOperand(0);
3503
3504 // Convert to target node to get known bits
3505 if (N0.getValueType() == MVT::f32)
3506 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3507
3508 if (getTargetMachine().Options.UnsafeFPMath) {
3509 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3510 return SDValue();
3511 }
3512
3513 assert(N0.getSimpleValueType() == MVT::f64);
3514
3515 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3516 const unsigned ExpMask = 0x7ff;
3517 const unsigned ExpBiasf64 = 1023;
3518 const unsigned ExpBiasf16 = 15;
3519 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3520 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3521 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3522 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3523 DAG.getConstant(32, DL, MVT::i64));
3524 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3525 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3526 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3527 DAG.getConstant(20, DL, MVT::i64));
3528 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3529 DAG.getConstant(ExpMask, DL, MVT::i32));
3530 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3531 // add the f16 bias (15) to get the biased exponent for the f16 format.
3532 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3533 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
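// E.g. (illustrative): for the f64 value 1.0 the stored exponent field is
// 1023, so E becomes 1023 - 1023 + 15 = 15, the biased f16 exponent of 1.0.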
3534
3535 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3536 DAG.getConstant(8, DL, MVT::i32));
3537 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3538 DAG.getConstant(0xffe, DL, MVT::i32));
3539
3540 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3541 DAG.getConstant(0x1ff, DL, MVT::i32));
3542 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3543
3544 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3545 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3546
3547 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3548 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3549 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3550 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
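// (0x7c00 is the all-ones f16 exponent, i.e. the Inf/NaN pattern; OR-ing in
// 0x0200 sets a mantissa bit so a NaN input stays a quiet NaN instead of
// collapsing to infinity.)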
3551
3552 // N = M | (E << 12);
3553 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3554 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3555 DAG.getConstant(12, DL, MVT::i32)));
3556
3557 // B = clamp(1-E, 0, 13);
3558 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3559 One, E);
3560 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3561 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3562 DAG.getConstant(13, DL, MVT::i32));
3563
3564 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3565 DAG.getConstant(0x1000, DL, MVT::i32));
3566
3567 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3568 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3569 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3570 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3571
3572 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3573 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3574 DAG.getConstant(0x7, DL, MVT::i32));
3575 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3576 DAG.getConstant(2, DL, MVT::i32));
3577 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3578 One, Zero, ISD::SETEQ);
3579 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3580 One, Zero, ISD::SETGT);
3581 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3582 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3583
3584 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3585 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3586 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3587 I, V, ISD::SETEQ);
3588
3589 // Extract the sign bit.
3590 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3591 DAG.getConstant(16, DL, MVT::i32));
3592 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3593 DAG.getConstant(0x8000, DL, MVT::i32));
3594
3595 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3596 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3597}
3598
3599SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3600 SelectionDAG &DAG) const {
3601 SDValue Src = Op.getOperand(0);
3602 unsigned OpOpcode = Op.getOpcode();
3603 EVT SrcVT = Src.getValueType();
3604 EVT DestVT = Op.getValueType();
3605
3606 // Will be selected natively
3607 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3608 return Op;
3609
3610 if (SrcVT == MVT::bf16) {
3611 SDLoc DL(Op);
3612 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3613 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3614 }
3615
3616 // Promote i16 to i32
3617 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3618 SDLoc DL(Op);
3619
3620 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3621 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3622 }
3623
3624 if (DestVT != MVT::i64)
3625 return Op;
3626
3627 if (SrcVT == MVT::f16 ||
3628 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3629 SDLoc DL(Op);
3630
3631 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3632 unsigned Ext =
3633 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3634 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3635 }
3636
3637 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3638 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3639
3640 return SDValue();
3641}
3642
3643SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3644 SelectionDAG &DAG) const {
3645 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3646 MVT VT = Op.getSimpleValueType();
3647 MVT ScalarVT = VT.getScalarType();
3648
3649 assert(VT.isVector());
3650
3651 SDValue Src = Op.getOperand(0);
3652 SDLoc DL(Op);
3653
3654 // TODO: Don't scalarize on Evergreen?
3655 unsigned NElts = VT.getVectorNumElements();
3656 SmallVector<SDValue, 8> Args;
3657 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3658
3659 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3660 for (unsigned I = 0; I < NElts; ++I)
3661 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3662
3663 return DAG.getBuildVector(VT, DL, Args);
3664}
3665
3666//===----------------------------------------------------------------------===//
3667// Custom DAG optimizations
3668//===----------------------------------------------------------------------===//
3669
3670static bool isU24(SDValue Op, SelectionDAG &DAG) {
3671 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3672}
3673
3674static bool isI24(SDValue Op, SelectionDAG &DAG) {
3675 EVT VT = Op.getValueType();
3676 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3677 // as unsigned 24-bit values.
3678 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3679}
3680
3681SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3682 TargetLowering::DAGCombinerInfo &DCI) const {
3683 SelectionDAG &DAG = DCI.DAG;
3684 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3685 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3686
3687 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3688 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3689 unsigned NewOpcode = Node24->getOpcode();
3690 if (IsIntrin) {
3691 unsigned IID = Node24->getConstantOperandVal(0);
3692 switch (IID) {
3693 case Intrinsic::amdgcn_mul_i24:
3694 NewOpcode = AMDGPUISD::MUL_I24;
3695 break;
3696 case Intrinsic::amdgcn_mul_u24:
3697 NewOpcode = AMDGPUISD::MUL_U24;
3698 break;
3699 case Intrinsic::amdgcn_mulhi_i24:
3700 NewOpcode = AMDGPUISD::MULHI_I24;
3701 break;
3702 case Intrinsic::amdgcn_mulhi_u24:
3703 NewOpcode = AMDGPUISD::MULHI_U24;
3704 break;
3705 default:
3706 llvm_unreachable("Expected 24-bit mul intrinsic");
3707 }
3708 }
3709
3710 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3711
3712 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3713 // the operands to have other uses, but will only perform simplifications that
3714 // involve bypassing some nodes for this user.
3715 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3716 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3717 if (DemandedLHS || DemandedRHS)
3718 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3719 DemandedLHS ? DemandedLHS : LHS,
3720 DemandedRHS ? DemandedRHS : RHS);
3721
3722 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3723 // operands if this node is the only user.
3724 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3725 return SDValue(Node24, 0);
3726 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3727 return SDValue(Node24, 0);
3728
3729 return SDValue();
3730}
3731
3732template <typename IntTy>
3733static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3734 uint32_t Width, const SDLoc &DL) {
3735 if (Width + Offset < 32) {
3736 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3737 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3738 return DAG.getConstant(Result, DL, MVT::i32);
3739 }
3740
3741 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3742}
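// Sanity check of the fold above (illustrative, not from the original file):
// for Src0 = 0xff00, Offset = 8, Width = 8, the signed instantiation computes
// Shl = 0xff00 << 16 = 0xff000000 and Result = int32_t(0xff000000) >> 24 = -1,
// i.e. the extracted byte 0xff sign-extended to 32 bits.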
3743
3744static bool hasVolatileUser(SDNode *Val) {
3745 for (SDNode *U : Val->uses()) {
3746 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3747 if (M->isVolatile())
3748 return true;
3749 }
3750 }
3751
3752 return false;
3753}
3754
3755bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3756 // i32 vectors are the canonical memory type.
3757 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3758 return false;
3759
3760 if (!VT.isByteSized())
3761 return false;
3762
3763 unsigned Size = VT.getStoreSize();
3764
3765 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3766 return false;
3767
3768 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3769 return false;
3770
3771 return true;
3772}
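// E.g. (illustrative): v4i8 (store size 4, a vector) and i96 (store size 12)
// qualify for the combine, becoming i32 and v3i32 via getEquivalentMemType,
// while i24 (size 3) and a plain scalar i16 do not.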
3773
3774// Replace load of an illegal type with a store of a bitcast to a friendlier
3775// type.
3776SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3777 DAGCombinerInfo &DCI) const {
3778 if (!DCI.isBeforeLegalize())
3779 return SDValue();
3780
3781 LoadSDNode *LN = cast<LoadSDNode>(N);
3782 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3783 return SDValue();
3784
3785 SDLoc SL(N);
3786 SelectionDAG &DAG = DCI.DAG;
3787 EVT VT = LN->getMemoryVT();
3788
3789 unsigned Size = VT.getStoreSize();
3790 Align Alignment = LN->getAlign();
3791 if (Alignment < Size && isTypeLegal(VT)) {
3792 unsigned IsFast;
3793 unsigned AS = LN->getAddressSpace();
3794
3795 // Expand unaligned loads earlier than legalization. Due to visitation order
3796 // problems during legalization, the emitted instructions to pack and unpack
3797 // the bytes again are not eliminated in the case of an unaligned copy.
3798 if (!allowsMisalignedMemoryAccesses(
3799 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3800 if (VT.isVector())
3801 return SplitVectorLoad(SDValue(LN, 0), DAG);
3802
3803 SDValue Ops[2];
3804 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3805
3806 return DAG.getMergeValues(Ops, SDLoc(N));
3807 }
3808
3809 if (!IsFast)
3810 return SDValue();
3811 }
3812
3813 if (!shouldCombineMemoryType(VT))
3814 return SDValue();
3815
3816 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3817
3818 SDValue NewLoad
3819 = DAG.getLoad(NewVT, SL, LN->getChain(),
3820 LN->getBasePtr(), LN->getMemOperand());
3821
3822 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3823 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3824 return SDValue(N, 0);
3825}
3826
3827// Replace store of an illegal type with a store of a bitcast to a friendlier
3828// type.
3829SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3830 DAGCombinerInfo &DCI) const {
3831 if (!DCI.isBeforeLegalize())
3832 return SDValue();
3833
3834 StoreSDNode *SN = cast<StoreSDNode>(N);
3835 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3836 return SDValue();
3837
3838 EVT VT = SN->getMemoryVT();
3839 unsigned Size = VT.getStoreSize();
3840
3841 SDLoc SL(N);
3842 SelectionDAG &DAG = DCI.DAG;
3843 Align Alignment = SN->getAlign();
3844 if (Alignment < Size && isTypeLegal(VT)) {
3845 unsigned IsFast;
3846 unsigned AS = SN->getAddressSpace();
3847
3848 // Expand unaligned stores earlier than legalization. Due to visitation
3849 // order problems during legalization, the emitted instructions to pack and
3850 // unpack the bytes again are not eliminated in the case of an unaligned
3851 // copy.
3852 if (!allowsMisalignedMemoryAccesses(
3853 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3854 if (VT.isVector())
3855 return SplitVectorStore(SDValue(SN, 0), DAG);
3856
3857 return expandUnalignedStore(SN, DAG);
3858 }
3859
3860 if (!IsFast)
3861 return SDValue();
3862 }
3863
3864 if (!shouldCombineMemoryType(VT))
3865 return SDValue();
3866
3867 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3868 SDValue Val = SN->getValue();
3869
3870 //DCI.AddToWorklist(Val.getNode());
3871
3872 bool OtherUses = !Val.hasOneUse();
3873 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3874 if (OtherUses) {
3875 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3876 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3877 }
3878
3879 return DAG.getStore(SN->getChain(), SL, CastVal,
3880 SN->getBasePtr(), SN->getMemOperand());
3881}
3882
3883// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3884// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3885// issues.
3886SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3887 DAGCombinerInfo &DCI) const {
3888 SelectionDAG &DAG = DCI.DAG;
3889 SDValue N0 = N->getOperand(0);
3890
3891 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3892 // (vt2 (truncate (assertzext vt0:x, vt1)))
3893 if (N0.getOpcode() == ISD::TRUNCATE) {
3894 SDValue N1 = N->getOperand(1);
3895 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3896 SDLoc SL(N);
3897
3898 SDValue Src = N0.getOperand(0);
3899 EVT SrcVT = Src.getValueType();
3900 if (SrcVT.bitsGE(ExtVT)) {
3901 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3902 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3903 }
3904 }
3905
3906 return SDValue();
3907}
3908
3909SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3910 SDNode *N, DAGCombinerInfo &DCI) const {
3911 unsigned IID = N->getConstantOperandVal(0);
3912 switch (IID) {
3913 case Intrinsic::amdgcn_mul_i24:
3914 case Intrinsic::amdgcn_mul_u24:
3915 case Intrinsic::amdgcn_mulhi_i24:
3916 case Intrinsic::amdgcn_mulhi_u24:
3917 return simplifyMul24(N, DCI);
3918 case Intrinsic::amdgcn_fract:
3919 case Intrinsic::amdgcn_rsq:
3920 case Intrinsic::amdgcn_rcp_legacy:
3921 case Intrinsic::amdgcn_rsq_legacy:
3922 case Intrinsic::amdgcn_rsq_clamp: {
3923 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3924 SDValue Src = N->getOperand(1);
3925 return Src.isUndef() ? Src : SDValue();
3926 }
3927 case Intrinsic::amdgcn_frexp_exp: {
3928 // frexp_exp (fneg x) -> frexp_exp x
3929 // frexp_exp (fabs x) -> frexp_exp x
3930 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3931 SDValue Src = N->getOperand(1);
3932 SDValue PeekSign = peekFPSignOps(Src);
3933 if (PeekSign == Src)
3934 return SDValue();
3935 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3936 0);
3937 }
3938 default:
3939 return SDValue();
3940 }
3941}
3942
3943/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3944/// binary operation \p Opc to it with the corresponding constant operands.
3945SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3946 DAGCombinerInfo &DCI, const SDLoc &SL,
3947 unsigned Opc, SDValue LHS,
3948 uint32_t ValLo, uint32_t ValHi) const {
3949 SelectionDAG &DAG = DCI.DAG;
3950 SDValue Lo, Hi;
3951 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3952
3953 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3954 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3955
3956 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3957 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3958
3959 // Re-visit the ands. It's possible we eliminated one of them and it could
3960 // simplify the vector.
3961 DCI.AddToWorklist(Lo.getNode());
3962 DCI.AddToWorklist(Hi.getNode());
3963
3964 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3965 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3966}
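// For instance (illustrative, not from the original file), an i64 AND with
// the constant 0x00000000ffffffff comes through here as (and Lo, 0xffffffff)
// and (and Hi, 0x0); the high half then folds to zero and the rebuilt vector
// simplifies further.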
3967
3968SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3969 DAGCombinerInfo &DCI) const {
3970 EVT VT = N->getValueType(0);
3971
3972 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3973 if (!RHS)
3974 return SDValue();
3975
3976 SDValue LHS = N->getOperand(0);
3977 unsigned RHSVal = RHS->getZExtValue();
3978 if (!RHSVal)
3979 return LHS;
3980
3981 SDLoc SL(N);
3982 SelectionDAG &DAG = DCI.DAG;
3983
3984 switch (LHS->getOpcode()) {
3985 default:
3986 break;
3987 case ISD::ZERO_EXTEND:
3988 case ISD::SIGN_EXTEND:
3989 case ISD::ANY_EXTEND: {
3990 SDValue X = LHS->getOperand(0);
3991
3992 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3993 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3994 // Prefer build_vector as the canonical form if packed types are legal.
3995 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3996 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3997 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3998 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3999 }
4000
4001 // shl (ext x) => zext (shl x), if shift does not overflow int
4002 if (VT != MVT::i64)
4003 break;
4004 KnownBits Known = DAG.computeKnownBits(X);
4005 unsigned LZ = Known.countMinLeadingZeros();
4006 if (LZ < RHSVal)
4007 break;
4008 EVT XVT = X.getValueType();
4009 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4010 return DAG.getZExtOrTrunc(Shl, SL, VT);
4011 }
4012 }
4013
4014 if (VT != MVT::i64)
4015 return SDValue();
4016
4017 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4018
4019 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4020 // common case, splitting this into a move and a 32-bit shift is faster and
4021 // the same code size.
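// E.g. (illustrative): i64 (shl x, 40) becomes
// bitcast (build_vector 0, (shl (i32 (trunc x)), 8)), since 40 - 32 == 8.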
4022 if (RHSVal < 32)
4023 return SDValue();
4024
4025 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4026
4027 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4028 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4029
4030 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4031
4032 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4033 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4034}
4035
4036SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4037 DAGCombinerInfo &DCI) const {
4038 if (N->getValueType(0) != MVT::i64)
4039 return SDValue();
4040
4041 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4042 if (!RHS)
4043 return SDValue();
4044
4045 SelectionDAG &DAG = DCI.DAG;
4046 SDLoc SL(N);
4047 unsigned RHSVal = RHS->getZExtValue();
4048
4049 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4050 if (RHSVal == 32) {
4051 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4052 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4053 DAG.getConstant(31, SL, MVT::i32));
4054
4055 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4056 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4057 }
4058
4059 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4060 if (RHSVal == 63) {
4061 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4062 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4063 DAG.getConstant(31, SL, MVT::i32));
4064 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4065 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4066 }
4067
4068 return SDValue();
4069}
4070
4071SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4072 DAGCombinerInfo &DCI) const {
4073 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4074 if (!RHS)
4075 return SDValue();
4076
4077 EVT VT = N->getValueType(0);
4078 SDValue LHS = N->getOperand(0);
4079 unsigned ShiftAmt = RHS->getZExtValue();
4080 SelectionDAG &DAG = DCI.DAG;
4081 SDLoc SL(N);
4082
4083 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4084 // this improves the ability to match BFE patterns in isel.
4085 if (LHS.getOpcode() == ISD::AND) {
4086 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4087 unsigned MaskIdx, MaskLen;
4088 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4089 MaskIdx == ShiftAmt) {
4090 return DAG.getNode(
4091 ISD::AND, SL, VT,
4092 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4093 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4094 }
4095 }
4096 }
4097
4098 if (VT != MVT::i64)
4099 return SDValue();
4100
4101 if (ShiftAmt < 32)
4102 return SDValue();
4103
4104 // srl i64:x, C for C >= 32
4105 // =>
4106 // build_pair (srl hi_32(x), C - 32), 0
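// e.g. (illustrative) i64 (srl x, 40) -> build_pair (srl hi_32(x), 8), 0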
4107 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4108
4109 SDValue Hi = getHiHalf64(LHS, DAG);
4110
4111 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4112 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4113
4114 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4115
4116 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4117}
4118
4119SDValue AMDGPUTargetLowering::performTruncateCombine(
4120 SDNode *N, DAGCombinerInfo &DCI) const {
4121 SDLoc SL(N);
4122 SelectionDAG &DAG = DCI.DAG;
4123 EVT VT = N->getValueType(0);
4124 SDValue Src = N->getOperand(0);
4125
4126 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4127 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4128 SDValue Vec = Src.getOperand(0);
4129 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4130 SDValue Elt0 = Vec.getOperand(0);
4131 EVT EltVT = Elt0.getValueType();
4132 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4133 if (EltVT.isFloatingPoint()) {
4134 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4135 EltVT.changeTypeToInteger(), Elt0);
4136 }
4137
4138 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4139 }
4140 }
4141 }
4142
4143 // Equivalent of above for accessing the high element of a vector as an
4144 // integer operation.
4145 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4146 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4147 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4148 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4149 SDValue BV = stripBitcast(Src.getOperand(0));
4150 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4151 BV.getValueType().getVectorNumElements() == 2) {
4152 SDValue SrcElt = BV.getOperand(1);
4153 EVT SrcEltVT = SrcElt.getValueType();
4154 if (SrcEltVT.isFloatingPoint()) {
4155 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4156 SrcEltVT.changeTypeToInteger(), SrcElt);
4157 }
4158
4159 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4160 }
4161 }
4162 }
4163 }
4164
4165 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4166 //
4167 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4168 // i16 (trunc (srl (i32 (trunc x), K)))
4169 if (VT.getScalarSizeInBits() < 32) {
4170 EVT SrcVT = Src.getValueType();
4171 if (SrcVT.getScalarSizeInBits() > 32 &&
4172 (Src.getOpcode() == ISD::SRL ||
4173 Src.getOpcode() == ISD::SRA ||
4174 Src.getOpcode() == ISD::SHL)) {
4175 SDValue Amt = Src.getOperand(1);
4176 KnownBits Known = DAG.computeKnownBits(Amt);
4177
4178 // - For left shifts, do the transform as long as the shift
4179 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4180 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4181 // losing information stored in the high bits when truncating.
4182 const unsigned MaxCstSize =
4183 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4184 if (Known.getMaxValue().ule(MaxCstSize)) {
4185 EVT MidVT = VT.isVector() ?
4186 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4187 VT.getVectorNumElements()) : MVT::i32;
4188
4189 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4190 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4191 Src.getOperand(0));
4192 DCI.AddToWorklist(Trunc.getNode());
4193
4194 if (Amt.getValueType() != NewShiftVT) {
4195 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4196 DCI.AddToWorklist(Amt.getNode());
4197 }
4198
4199 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4200 Trunc, Amt);
4201 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4202 }
4203 }
4204 }
4205
4206 return SDValue();
4207}
4208
4209// We need to specifically handle i64 mul here to avoid unnecessary conversion
4210// instructions. If we only match on the legalized i64 mul expansion,
4211// SimplifyDemandedBits will be unable to remove them because there will be
4212// multiple uses due to the separate mul + mulh[su].
4213static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4214 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4215 if (Size <= 32) {
4216 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4217 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4218 }
4219
4220 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4221 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4222
4223 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4224 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4225
4226 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4227}
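// Illustrative use of the 64-bit path above (not from the original file):
// for the unsigned 24-bit inputs 0x800000 * 0x800000 = 2^46, MulLo is 0 and
// MulHi is 0x4000, and BUILD_PAIR reassembles the i64 result.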
4228
4229/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4230/// return SDValue().
4231static SDValue getAddOneOp(const SDNode *V) {
4232 if (V->getOpcode() != ISD::ADD)
4233 return SDValue();
4234
4235 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4236}
4237
4238SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4239 DAGCombinerInfo &DCI) const {
4240 assert(N->getOpcode() == ISD::MUL);
4241 EVT VT = N->getValueType(0);
4242
4243 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4244 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4245 // unnecessarily). isDivergent() is used as an approximation of whether the
4246 // value is in an SGPR.
4247 if (!N->isDivergent())
4248 return SDValue();
4249
4250 unsigned Size = VT.getSizeInBits();
4251 if (VT.isVector() || Size > 64)
4252 return SDValue();
4253
4254 SelectionDAG &DAG = DCI.DAG;
4255 SDLoc DL(N);
4256
4257 SDValue N0 = N->getOperand(0);
4258 SDValue N1 = N->getOperand(1);
4259
4260 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4261 // matching.
4262
4263 // mul x, (add y, 1) -> add (mul x, y), x
4264 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4265 SDValue AddOp = getAddOneOp(V.getNode());
4266 if (!AddOp)
4267 return SDValue();
4268
4269 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4270 return U->getOpcode() == ISD::MUL;
4271 }))
4272 return AddOp;
4273
4274 return SDValue();
4275 };
4276
4277 // FIXME: The selection pattern is not properly checking for commuted
4278 // operands, so we have to place the mul in the LHS
4279 if (SDValue MulOper = IsFoldableAdd(N0)) {
4280 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4281 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4282 }
4283
4284 if (SDValue MulOper = IsFoldableAdd(N1)) {
4285 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4286 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4287 }
4288
4289 // There are native i16 integer mul/mad instructions, so don't form mul24 here.
4290 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4291 return SDValue();
4292
4293 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4294 // in the source into any_extends if the result of the mul is truncated. Since
4295 // we can assume the high bits are whatever we want, use the underlying value
4296 // to avoid the unknown high bits from interfering.
4297 if (N0.getOpcode() == ISD::ANY_EXTEND)
4298 N0 = N0.getOperand(0);
4299
4300 if (N1.getOpcode() == ISD::ANY_EXTEND)
4301 N1 = N1.getOperand(0);
4302
4303 SDValue Mul;
4304
4305 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4306 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4307 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4308 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4309 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4310 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4311 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4312 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4313 } else {
4314 return SDValue();
4315 }
4316
4317 // We need to use sext even for MUL_U24, because MUL_U24 is used
4318 // for signed multiply of 8 and 16-bit types.
4319 return DAG.getSExtOrTrunc(Mul, DL, VT);
4320}
4321
4322SDValue
4323AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4324 DAGCombinerInfo &DCI) const {
4325 if (N->getValueType(0) != MVT::i32)
4326 return SDValue();
4327
4328 SelectionDAG &DAG = DCI.DAG;
4329 SDLoc DL(N);
4330
4331 SDValue N0 = N->getOperand(0);
4332 SDValue N1 = N->getOperand(1);
4333
4334 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4335 // in the source into any_extends if the result of the mul is truncated. Since
4336 // we can assume the high bits are whatever we want, use the underlying value
4337 // to avoid the unknown high bits from interfering.
4338 if (N0.getOpcode() == ISD::ANY_EXTEND)
4339 N0 = N0.getOperand(0);
4340 if (N1.getOpcode() == ISD::ANY_EXTEND)
4341 N1 = N1.getOperand(0);
4342
4343 // Try to use two fast 24-bit multiplies (one for each half of the result)
4344 // instead of one slow extending multiply.
4345 unsigned LoOpcode, HiOpcode;
4346 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4347 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4348 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4349 LoOpcode = AMDGPUISD::MUL_U24;
4350 HiOpcode = AMDGPUISD::MULHI_U24;
4351 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4352 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4353 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4354 LoOpcode = AMDGPUISD::MUL_I24;
4355 HiOpcode = AMDGPUISD::MULHI_I24;
4356 } else {
4357 return SDValue();
4358 }
4359
4360 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4361 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4362 DCI.CombineTo(N, Lo, Hi);
4363 return SDValue(N, 0);
4364}
4365
4366SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4367 DAGCombinerInfo &DCI) const {
4368 EVT VT = N->getValueType(0);
4369
4370 if (!Subtarget->hasMulI24() || VT.isVector())
4371 return SDValue();
4372
4373 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4374 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4375 // unnecessarily). isDivergent() is used as an approximation of whether the
4376 // value is in an SGPR.
4377 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4378 // valu op anyway)
4379 if (Subtarget->hasSMulHi() && !N->isDivergent())
4380 return SDValue();
4381
4382 SelectionDAG &DAG = DCI.DAG;
4383 SDLoc DL(N);
4384
4385 SDValue N0 = N->getOperand(0);
4386 SDValue N1 = N->getOperand(1);
4387
4388 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4389 return SDValue();
4390
4391 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4392 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4393
4394 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4395 DCI.AddToWorklist(Mulhi.getNode());
4396 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4397}
4398
4399SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4400 DAGCombinerInfo &DCI) const {
4401 EVT VT = N->getValueType(0);
4402
4403 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4404 return SDValue();
4405
4406 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4407 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4408 // unnecessarily). isDivergent() is used as an approximation of whether the
4409 // value is in an SGPR.
4410 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4411 // valu op anyway)
4412 if (Subtarget->hasSMulHi() && !N->isDivergent())
4413 return SDValue();
4414
4415 SelectionDAG &DAG = DCI.DAG;
4416 SDLoc DL(N);
4417
4418 SDValue N0 = N->getOperand(0);
4419 SDValue N1 = N->getOperand(1);
4420
4421 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4422 return SDValue();
4423
4424 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4425 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4426
4427 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4428 DCI.AddToWorklist(Mulhi.getNode());
4429 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4430}
4431
4432SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4433 SDValue Op,
4434 const SDLoc &DL,
4435 unsigned Opc) const {
4436 EVT VT = Op.getValueType();
4437 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4438 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4439 LegalVT != MVT::i16))
4440 return SDValue();
4441
4442 if (VT != MVT::i32)
4443 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4444
4445 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4446 if (VT != MVT::i32)
4447 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4448
4449 return FFBX;
4450}
4451
4452// The native instructions return -1 on 0 input. Optimize out a select that
4453// produces -1 on 0.
4454//
4455// TODO: If zero is not undef, we could also do this if the output is compared
4456// against the bitwidth.
4457//
4458// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4459SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4460 SDValue LHS, SDValue RHS,
4461 DAGCombinerInfo &DCI) const {
4462 if (!isNullConstant(Cond.getOperand(1)))
4463 return SDValue();
4464
4465 SelectionDAG &DAG = DCI.DAG;
4466 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4467 SDValue CmpLHS = Cond.getOperand(0);
4468
4469 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4470 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4471 if (CCOpcode == ISD::SETEQ &&
4472 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4473 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4474 unsigned Opc =
4475 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4476 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4477 }
4478
4479 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4480 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4481 if (CCOpcode == ISD::SETNE &&
4482 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4483 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4484 unsigned Opc =
4485 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4486
4487 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4488 }
4489
4490 return SDValue();
4491}
4492
4493static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4494 unsigned Op,
4495 const SDLoc &SL,
4496 SDValue Cond,
4497 SDValue N1,
4498 SDValue N2) {
4499 SelectionDAG &DAG = DCI.DAG;
4500 EVT VT = N1.getValueType();
4501
4502 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4503 N1.getOperand(0), N2.getOperand(0));
4504 DCI.AddToWorklist(NewSelect.getNode());
4505 return DAG.getNode(Op, SL, VT, NewSelect);
4506}
4507
4508// Pull a free FP operation out of a select so it may fold into uses.
4509//
4510// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4511// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4512//
4513// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4514// select c, (fabs x), +k -> fabs (select c, x, k)
4515SDValue
4516AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4517 SDValue N) const {
4518 SelectionDAG &DAG = DCI.DAG;
4519 SDValue Cond = N.getOperand(0);
4520 SDValue LHS = N.getOperand(1);
4521 SDValue RHS = N.getOperand(2);
4522
4523 EVT VT = N.getValueType();
4524 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4525 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4526 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4527 return SDValue();
4528
4529 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4530 SDLoc(N), Cond, LHS, RHS);
4531 }
4532
4533 bool Inv = false;
4534 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4535 std::swap(LHS, RHS);
4536 Inv = true;
4537 }
4538
4539 // TODO: Support vector constants.
4540 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4541 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4542 !selectSupportsSourceMods(N.getNode())) {
4543 SDLoc SL(N);
4544 // If one side is an fneg/fabs and the other is a constant, we can push the
4545 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4546 SDValue NewLHS = LHS.getOperand(0);
4547 SDValue NewRHS = RHS;
4548
4549 // Careful: if the neg can be folded up, don't try to pull it back down.
4550 bool ShouldFoldNeg = true;
4551
4552 if (NewLHS.hasOneUse()) {
4553 unsigned Opc = NewLHS.getOpcode();
4554 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4555 ShouldFoldNeg = false;
4556 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4557 ShouldFoldNeg = false;
4558 }
4559
4560 if (ShouldFoldNeg) {
4561 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4562 return SDValue();
4563
4564 // We're going to be forced to use a source modifier anyway, there's no
4565 // point to pulling the negate out unless we can get a size reduction by
4566 // negating the constant.
4567 //
4568 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4569 // about cheaper constants.
4570 if (NewLHS.getOpcode() == ISD::FABS &&
4571 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4572 return SDValue();
4573
4574 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4575 return SDValue();
4576
4577 if (LHS.getOpcode() == ISD::FNEG)
4578 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4579
4580 if (Inv)
4581 std::swap(NewLHS, NewRHS);
4582
4583 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4584 Cond, NewLHS, NewRHS);
4585 DCI.AddToWorklist(NewSelect.getNode());
4586 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4587 }
4588 }
4589
4590 return SDValue();
4591}
4592
4594 DAGCombinerInfo &DCI) const {
4595 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4596 return Folded;
4597
4598 SDValue Cond = N->getOperand(0);
4599 if (Cond.getOpcode() != ISD::SETCC)
4600 return SDValue();
4601
4602 EVT VT = N->getValueType(0);
4603 SDValue LHS = Cond.getOperand(0);
4604 SDValue RHS = Cond.getOperand(1);
4605 SDValue CC = Cond.getOperand(2);
4606
4607 SDValue True = N->getOperand(1);
4608 SDValue False = N->getOperand(2);
4609
4610 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4611 SelectionDAG &DAG = DCI.DAG;
4612 if (DAG.isConstantValueOfAnyType(True) &&
4613 !DAG.isConstantValueOfAnyType(False)) {
4614 // Swap cmp + select pair to move constant to false input.
4615 // This will allow using VOPC cndmasks more often.
4616 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4617
4618 SDLoc SL(N);
4619 ISD::CondCode NewCC =
4620 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4621
4622 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4623 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4624 }
4625
4626 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4627 SDValue MinMax
4628 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4629 // Revisit this node so we can catch min3/max3/med3 patterns.
4630 //DCI.AddToWorklist(MinMax.getNode());
4631 return MinMax;
4632 }
4633 }
4634
4635 // There's no reason to not do this if the condition has other uses.
4636 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4637}
4638
4639static bool isInv2Pi(const APFloat &APF) {
4640 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4641 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4642 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4643
4644 return APF.bitwiseIsEqual(KF16) ||
4645 APF.bitwiseIsEqual(KF32) ||
4646 APF.bitwiseIsEqual(KF64);
4647}
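// (All three constants are bit patterns of 1/(2*pi) ~= 0.15915494 encoded as
// IEEE half, single and double precision respectively.)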
4648
4649 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4650// additional cost to negate them.
4651TargetLowering::NegatibleCost
4652AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4653 if (C->isZero())
4654 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4655
4656 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4657 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4658
4659 return NegatibleCost::Neutral;
4660}
4661
4662bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4663 if (ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4664 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4665 return false;
4666}
4667
4668bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4669 if (ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4670 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4671 return false;
4672}
4673
4674static unsigned inverseMinMax(unsigned Opc) {
4675 switch (Opc) {
4676 case ISD::FMAXNUM:
4677 return ISD::FMINNUM;
4678 case ISD::FMINNUM:
4679 return ISD::FMAXNUM;
4680 case ISD::FMAXNUM_IEEE:
4681 return ISD::FMINNUM_IEEE;
4682 case ISD::FMINNUM_IEEE:
4683 return ISD::FMAXNUM_IEEE;
4684 case ISD::FMAXIMUM:
4685 return ISD::FMINIMUM;
4686 case ISD::FMINIMUM:
4687 return ISD::FMAXIMUM;
4688 case AMDGPUISD::FMAX_LEGACY:
4689 return AMDGPUISD::FMIN_LEGACY;
4690 case AMDGPUISD::FMIN_LEGACY:
4691 return AMDGPUISD::FMAX_LEGACY;
4692 default:
4693 llvm_unreachable("invalid min/max opcode");
4694 }
4695}
4696
4697/// \return true if it's profitable to try to push an fneg into its source
4698/// instruction.
4699static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4700 // If the input has multiple uses and we can either fold the negate down, or
4701 // the other uses cannot, give up. This both prevents unprofitable
4702 // transformations and infinite loops: we won't repeatedly try to fold around
4703 // a negate that has no 'good' form.
4704 if (N0.hasOneUse()) {
4705 // This may be able to fold into the source, but at a code size cost. Don't
4706 // fold if the fold into the user is free.
4707 if (allUsesHaveSourceMods(N, 0))
4708 return false;
4709 } else {
4710 if (fnegFoldsIntoOp(N0.getNode()) &&
4711 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4712 return false;
4713 }
4714
4715 return true;
4716}
4717
4718SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4719 DAGCombinerInfo &DCI) const {
4720 SelectionDAG &DAG = DCI.DAG;
4721 SDValue N0 = N->getOperand(0);
4722 EVT VT = N->getValueType(0);
4723
4724 unsigned Opc = N0.getOpcode();
4725
4726 if (!shouldFoldFNegIntoSrc(N, N0))
4727 return SDValue();
4728
4729 SDLoc SL(N);
4730 switch (Opc) {
4731 case ISD::FADD: {
4732 if (!mayIgnoreSignedZero(N0))
4733 return SDValue();
4734
4735 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4736 SDValue LHS = N0.getOperand(0);
4737 SDValue RHS = N0.getOperand(1);
4738
4739 if (LHS.getOpcode() != ISD::FNEG)
4740 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4741 else
4742 LHS = LHS.getOperand(0);
4743
4744 if (RHS.getOpcode() != ISD::FNEG)
4745 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4746 else
4747 RHS = RHS.getOperand(0);
4748
4749 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4750 if (Res.getOpcode() != ISD::FADD)
4751 return SDValue(); // Op got folded away.
4752 if (!N0.hasOneUse())
4753 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4754 return Res;
4755 }
4756 case ISD::FMUL:
4757 case AMDGPUISD::FMUL_LEGACY: {
4758 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4759 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4760 SDValue LHS = N0.getOperand(0);
4761 SDValue RHS = N0.getOperand(1);
4762
4763 if (LHS.getOpcode() == ISD::FNEG)
4764 LHS = LHS.getOperand(0);
4765 else if (RHS.getOpcode() == ISD::FNEG)
4766 RHS = RHS.getOperand(0);
4767 else
4768 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4769
4770 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4771 if (Res.getOpcode() != Opc)
4772 return SDValue(); // Op got folded away.
4773 if (!N0.hasOneUse())
4774 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4775 return Res;
4776 }
4777 case ISD::FMA:
4778 case ISD::FMAD: {
4779 // TODO: handle llvm.amdgcn.fma.legacy
4780 if (!mayIgnoreSignedZero(N0))
4781 return SDValue();
4782
4783 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4784 SDValue LHS = N0.getOperand(0);
4785 SDValue MHS = N0.getOperand(1);
4786 SDValue RHS = N0.getOperand(2);
4787
4788 if (LHS.getOpcode() == ISD::FNEG)
4789 LHS = LHS.getOperand(0);
4790 else if (MHS.getOpcode() == ISD::FNEG)
4791 MHS = MHS.getOperand(0);
4792 else
4793 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4794
4795 if (RHS.getOpcode() != ISD::FNEG)
4796 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4797 else
4798 RHS = RHS.getOperand(0);
4799
4800 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4801 if (Res.getOpcode() != Opc)
4802 return SDValue(); // Op got folded away.
4803 if (!N0.hasOneUse())
4804 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4805 return Res;
4806 }
4807 case ISD::FMAXNUM:
4808 case ISD::FMINNUM:
4809 case ISD::FMAXNUM_IEEE:
4810 case ISD::FMINNUM_IEEE:
4811 case ISD::FMINIMUM:
4812 case ISD::FMAXIMUM:
4813 case AMDGPUISD::FMAX_LEGACY:
4814 case AMDGPUISD::FMIN_LEGACY: {
4815 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4816 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4817 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4818 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4819
4820 SDValue LHS = N0.getOperand(0);
4821 SDValue RHS = N0.getOperand(1);
4822
4823 // 0 doesn't have a negated inline immediate.
4824 // TODO: This constant check should be generalized to other operations.
4825 if (isConstantCostlierToNegate(RHS))
4826 return SDValue();
4827
4828 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4829 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4830 unsigned Opposite = inverseMinMax(Opc);
4831
4832 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4833 if (Res.getOpcode() != Opposite)
4834 return SDValue(); // Op got folded away.
4835 if (!N0.hasOneUse())
4836 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4837 return Res;
4838 }
4839 case AMDGPUISD::FMED3: {
4840 SDValue Ops[3];
4841 for (unsigned I = 0; I < 3; ++I)
4842 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4843
4844 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4845 if (Res.getOpcode() != AMDGPUISD::FMED3)
4846 return SDValue(); // Op got folded away.
4847
4848 if (!N0.hasOneUse()) {
4849 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4850 DAG.ReplaceAllUsesWith(N0, Neg);
4851
4852 for (SDNode *U : Neg->uses())
4853 DCI.AddToWorklist(U);
4854 }
4855
4856 return Res;
4857 }
4858 case ISD::FP_EXTEND:
4859 case ISD::FTRUNC:
4860 case ISD::FRINT:
4861 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4862 case ISD::FROUNDEVEN:
4863 case ISD::FSIN:
4864 case ISD::FCANONICALIZE:
4865 case AMDGPUISD::RCP:
4866 case AMDGPUISD::RCP_LEGACY:
4867 case AMDGPUISD::RCP_IFLAG:
4868 case AMDGPUISD::SIN_HW: {
4869 SDValue CvtSrc = N0.getOperand(0);
4870 if (CvtSrc.getOpcode() == ISD::FNEG) {
4871 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4872 // (fneg (rcp (fneg x))) -> (rcp x)
4873 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4874 }
4875
4876 if (!N0.hasOneUse())
4877 return SDValue();
4878
4879 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4880 // (fneg (rcp x)) -> (rcp (fneg x))
4881 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4882 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4883 }
4884 case ISD::FP_ROUND: {
4885 SDValue CvtSrc = N0.getOperand(0);
4886
4887 if (CvtSrc.getOpcode() == ISD::FNEG) {
4888 // (fneg (fp_round (fneg x))) -> (fp_round x)
4889 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4890 CvtSrc.getOperand(0), N0.getOperand(1));
4891 }
4892
4893 if (!N0.hasOneUse())
4894 return SDValue();
4895
4896 // (fneg (fp_round x)) -> (fp_round (fneg x))
4897 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4898 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4899 }
4900 case ISD::FP16_TO_FP: {
4901 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4902 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4903 // Put the fneg back as a legal source operation that can be matched later.
4904 SDLoc SL(N);
4905
4906 SDValue Src = N0.getOperand(0);
4907 EVT SrcVT = Src.getValueType();
4908
4909 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4910 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4911 DAG.getConstant(0x8000, SL, SrcVT));
4912 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4913 }
4914 case ISD::SELECT: {
4915 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4916 // TODO: Invert conditions of foldFreeOpFromSelect
4917 return SDValue();
4918 }
4919 case ISD::BITCAST: {
4920 SDLoc SL(N);
4921 SDValue BCSrc = N0.getOperand(0);
4922 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4923 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4924 if (HighBits.getValueType().getSizeInBits() != 32 ||
4925 !fnegFoldsIntoOp(HighBits.getNode()))
4926 return SDValue();
4927
4928 // f64 fneg only really needs to operate on the high half of the
4929 // register, so try to force it to an f32 operation to help make use of
4930 // source modifiers.
4931 //
4933 // fneg (f64 (bitcast (build_vector x, y))) ->
4934 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4935 // (fneg (bitcast i32:y to f32)))
4936
4937 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4938 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4939 SDValue CastBack =
4940 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4941
4942 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4943 Ops.back() = CastBack;
4944 DCI.AddToWorklist(NegHi.getNode());
4945 SDValue Build =
4946 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4947 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4948
4949 if (!N0.hasOneUse())
4950 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4951 return Result;
4952 }
4953
4954 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4955 BCSrc.hasOneUse()) {
4956 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4957 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4958
4959 // TODO: Cast back result for multiple uses is beneficial in some cases.
4960
4961 SDValue LHS =
4962 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4963 SDValue RHS =
4964 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4965
4966 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4967 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4968
4969 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4970 NegRHS);
4971 }
4972
4973 return SDValue();
4974 }
4975 default:
4976 return SDValue();
4977 }
4978}
4979
4980SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4981 DAGCombinerInfo &DCI) const {
4982 SelectionDAG &DAG = DCI.DAG;
4983 SDValue N0 = N->getOperand(0);
4984
4985 if (!N0.hasOneUse())
4986 return SDValue();
4987
4988 switch (N0.getOpcode()) {
4989 case ISD::FP16_TO_FP: {
4990 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4991 SDLoc SL(N);
4992 SDValue Src = N0.getOperand(0);
4993 EVT SrcVT = Src.getValueType();
4994
4995 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4996 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4997 DAG.getConstant(0x7fff, SL, SrcVT));
4998 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4999 }
5000 default:
5001 return SDValue();
5002 }
5003}
5004
5005SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5006 DAGCombinerInfo &DCI) const {
5007 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5008 if (!CFP)
5009 return SDValue();
5010
5011 // XXX - Should this flush denormals?
5012 const APFloat &Val = CFP->getValueAPF();
5013 APFloat One(Val.getSemantics(), "1.0");
5014 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5015}
5016
5017SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5018 DAGCombinerInfo &DCI) const {
5019 SelectionDAG &DAG = DCI.DAG;
5020 SDLoc DL(N);
5021
5022 switch(N->getOpcode()) {
5023 default:
5024 break;
5025 case ISD::BITCAST: {
5026 EVT DestVT = N->getValueType(0);
5027
5028 // Push casts through vector builds. This helps avoid emitting a large
5029 // number of copies when materializing floating point vector constants.
5030 //
5031 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5032 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5033 if (DestVT.isVector()) {
5034 SDValue Src = N->getOperand(0);
5035 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5036 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5037 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5038 EVT SrcVT = Src.getValueType();
5039 unsigned NElts = DestVT.getVectorNumElements();
5040
5041 if (SrcVT.getVectorNumElements() == NElts) {
5042 EVT DestEltVT = DestVT.getVectorElementType();
5043
5044 SmallVector<SDValue, 8> CastedElts;
5045 SDLoc SL(N);
5046 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5047 SDValue Elt = Src.getOperand(I);
5048 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5049 }
5050
5051 return DAG.getBuildVector(DestVT, SL, CastedElts);
5052 }
5053 }
5054 }
5055
5056 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5057 break;
5058
5059 // Fold bitcasts of constants.
5060 //
5061 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5062 // TODO: Generalize and move to DAGCombiner
5063 SDValue Src = N->getOperand(0);
5064 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5065 SDLoc SL(N);
5066 uint64_t CVal = C->getZExtValue();
5067 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5068 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5069 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5070 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5071 }
5072
5073 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5074 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5075 SDLoc SL(N);
5076 uint64_t CVal = Val.getZExtValue();
5077 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5078 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5079 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5080
5081 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5082 }
5083
5084 break;
5085 }
5086 case ISD::SHL: {
5087 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5088 break;
5089
5090 return performShlCombine(N, DCI);
5091 }
5092 case ISD::SRL: {
5093 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5094 break;
5095
5096 return performSrlCombine(N, DCI);
5097 }
5098 case ISD::SRA: {
5099 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5100 break;
5101
5102 return performSraCombine(N, DCI);
5103 }
5104 case ISD::TRUNCATE:
5105 return performTruncateCombine(N, DCI);
5106 case ISD::MUL:
5107 return performMulCombine(N, DCI);
5108 case AMDGPUISD::MUL_U24:
5109 case AMDGPUISD::MUL_I24: {
5110 if (SDValue Simplified = simplifyMul24(N, DCI))
5111 return Simplified;
5112 break;
5113 }
5114 case AMDGPUISD::MULHI_I24:
5115 case AMDGPUISD::MULHI_U24:
5116 return simplifyMul24(N, DCI);
5117 case ISD::SMUL_LOHI:
5118 case ISD::UMUL_LOHI:
5119 return performMulLoHiCombine(N, DCI);
5120 case ISD::MULHS:
5121 return performMulhsCombine(N, DCI);
5122 case ISD::MULHU:
5123 return performMulhuCombine(N, DCI);
5124 case ISD::SELECT:
5125 return performSelectCombine(N, DCI);
5126 case ISD::FNEG:
5127 return performFNegCombine(N, DCI);
5128 case ISD::FABS:
5129 return performFAbsCombine(N, DCI);
5130 case AMDGPUISD::BFE_I32:
5131 case AMDGPUISD::BFE_U32: {
5132 assert(!N->getValueType(0).isVector() &&
5133 "Vector handling of BFE not implemented");
5134 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5135 if (!Width)
5136 break;
5137
5138 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5139 if (WidthVal == 0)
5140 return DAG.getConstant(0, DL, MVT::i32);
5141
5142 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5143 if (!Offset)
5144 break;
5145
5146 SDValue BitsFrom = N->getOperand(0);
5147 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5148
5149 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5150
5151 if (OffsetVal == 0) {
5152 // This is already sign / zero extended, so try to fold away extra BFEs.
5153 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
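// e.g. for WidthVal == 8: a signed BFE at offset 0 guarantees
// 32 - 8 + 1 == 25 sign bits, an unsigned one 32 - 8 == 24.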
5154
5155 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5156 if (OpSignBits >= SignBits)
5157 return BitsFrom;
5158
5159 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5160 if (Signed) {
5161 // This is a sign_extend_inreg. Replace it to take advantage of existing
5162 // DAG Combines. If not eliminated, we will match back to BFE during
5163 // selection.
5164
5165 // TODO: The sext_inreg of extended types ends up as multiple operations,
5166 // although we could handle them in a single BFE.
5167 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5168 DAG.getValueType(SmallVT));
5169 }
5170
5171 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5172 }
5173
5174 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5175 if (Signed) {
5176 return constantFoldBFE<int32_t>(DAG,
5177 CVal->getSExtValue(),
5178 OffsetVal,
5179 WidthVal,
5180 DL);
5181 }
5182
5183 return constantFoldBFE<uint32_t>(DAG,
5184 CVal->getZExtValue(),
5185 OffsetVal,
5186 WidthVal,
5187 DL);
5188 }
5189
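// Once the field reaches bit 31 the BFE degenerates into a plain
// shift; e.g. a BFE with offset 24 and width 8 extracts the top byte,
// which is just a right shift by 24 (arithmetic when signed).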
5190 if ((OffsetVal + WidthVal) >= 32 &&
5191 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5192 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5193 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5194 BitsFrom, ShiftVal);
5195 }
5196
5197 if (BitsFrom.hasOneUse()) {
5198 APInt Demanded = APInt::getBitsSet(32,
5199 OffsetVal,
5200 OffsetVal + WidthVal);
5201
5202 KnownBits Known;
5203 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5204 !DCI.isBeforeLegalizeOps());
5205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5206 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5207 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5208 DCI.CommitTargetLoweringOpt(TLO);
5209 }
5210 }
5211
5212 break;
5213 }
5214 case ISD::LOAD:
5215 return performLoadCombine(N, DCI);
5216 case ISD::STORE:
5217 return performStoreCombine(N, DCI);
5218 case AMDGPUISD::RCP:
5219 case AMDGPUISD::RCP_IFLAG:
5220 return performRcpCombine(N, DCI);
5221 case ISD::AssertZext:
5222 case ISD::AssertSext:
5223 return performAssertSZExtCombine(N, DCI);
5224 case ISD::INTRINSIC_WO_CHAIN:
5225 return performIntrinsicWOChainCombine(N, DCI);
5226 case AMDGPUISD::FMAD_FTZ: {
5227 SDValue N0 = N->getOperand(0);
5228 SDValue N1 = N->getOperand(1);
5229 SDValue N2 = N->getOperand(2);
5230 EVT VT = N->getValueType(0);
5231
5232 // FMAD_FTZ is a FMAD + flush denormals to zero.
5233 // We flush the inputs, the intermediate step, and the output.
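// With all three operands constant this folds to
// ftz(ftz(ftz(a) * ftz(b)) + ftz(c)), as computed below.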
5234 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5235 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5236 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5237 if (N0CFP && N1CFP && N2CFP) {
5238 const auto FTZ = [](const APFloat &V) {
5239 if (V.isDenormal()) {
5240 APFloat Zero(V.getSemantics(), 0);
5241 return V.isNegative() ? -Zero : Zero;
5242 }
5243 return V;
5244 };
5245
5246 APFloat V0 = FTZ(N0CFP->getValueAPF());
5247 APFloat V1 = FTZ(N1CFP->getValueAPF());
5248 APFloat V2 = FTZ(N2CFP->getValueAPF());
5249 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5250 V0 = FTZ(V0);
5251 V0.add(V2, APFloat::rmNearestTiesToEven);
5252 return DAG.getConstantFP(FTZ(V0), DL, VT);
5253 }
5254 break;
5255 }
5256 }
5257 return SDValue();
5258 }
5259
5260//===----------------------------------------------------------------------===//
5261// Helper functions
5262//===----------------------------------------------------------------------===//
5263
5264 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5265 const TargetRegisterClass *RC,
5266 Register Reg, EVT VT,
5267 const SDLoc &SL,
5268 bool RawReg) const {
5269 MachineFunction &MF = DAG.getMachineFunction();
5270 MachineRegisterInfo &MRI = MF.getRegInfo();
5271 Register VReg;
5272
5273 if (!MRI.isLiveIn(Reg)) {
5274 VReg = MRI.createVirtualRegister(RC);
5275 MRI.addLiveIn(Reg, VReg);
5276 } else {
5277 VReg = MRI.getLiveInVirtReg(Reg);
5278 }
5279
5280 if (RawReg)
5281 return DAG.getRegister(VReg, VT);
5282
5283 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5284 }
5285
5286// This may be called multiple times, and nothing prevents creating multiple
5287// objects at the same offset. See if we already defined this object.
5288 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5289 int64_t Offset) {
5290 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5291 if (MFI.getObjectOffset(I) == Offset) {
5292 assert(MFI.getObjectSize(I) == Size);
5293 return I;
5294 }
5295 }
5296
5297 return MFI.CreateFixedObject(Size, Offset, true);
5298 }
5299
5300 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5301 EVT VT,
5302 const SDLoc &SL,
5303 int64_t Offset) const {
5304 MachineFunction &MF = DAG.getMachineFunction();
5305 MachineFrameInfo &MFI = MF.getFrameInfo();
5306 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5307
5308 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5309 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5310
5311 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5312 MachineMemOperand::MODereferenceable |
5313 MachineMemOperand::MOInvariant);
5314 }
5315
5316 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5317 const SDLoc &SL,
5318 SDValue Chain,
5319 SDValue ArgVal,
5320 int64_t Offset) const {
5321 MachineFunction &MF = DAG.getMachineFunction();
5322 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5323 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5324
5325 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5326 // Stores to the argument stack area are relative to the stack pointer.
5327 SDValue SP =
5328 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5329 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5330 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5331 MachineMemOperand::MODereferenceable);
5332 return Store;
5333 }
5334
5335 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5336 const TargetRegisterClass *RC,
5337 EVT VT, const SDLoc &SL,
5338 const ArgDescriptor &Arg) const {
5339 assert(Arg && "Attempting to load missing argument");
5340
5341 SDValue V = Arg.isRegister() ?
5342 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5343 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5344
5345 if (!Arg.isMasked())
5346 return V;
5347
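// Decode sketch for an assumed mask of 0x3ff0: the field occupies
// bits [13:4], so the value is recovered as (V >> 4) & 0x3ff.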
5348 unsigned Mask = Arg.getMask();
5349 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5350 V = DAG.getNode(ISD::SRL, SL, VT, V,
5351 DAG.getShiftAmountConstant(Shift, VT, SL));
5352 return DAG.getNode(ISD::AND, SL, VT, V,
5353 DAG.getConstant(Mask >> Shift, SL, VT));
5354 }
5355
5356 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5357 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5358 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5359 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5360 uint64_t ArgOffset =
5361 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
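// Worked example with assumed values: 20 bytes of explicit kernel
// arguments and an 8-byte implicit-arg alignment give
// alignTo(20, 8) == 24, so FIRST_IMPLICIT sits 24 bytes past the
// explicit kernarg offset.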
5362 switch (Param) {
5363 case FIRST_IMPLICIT:
5364 return ArgOffset;
5365 case PRIVATE_BASE:
5366 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5367 case SHARED_BASE:
5368 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5369 case QUEUE_PTR:
5370 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5371 }
5372 llvm_unreachable("unexpected implicit parameter type");
5373 }
5374
5375 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5376 const MachineFunction &MF, const ImplicitParameter Param) const {
5377 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5378 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5379 }
5380
5381#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5382
5383const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5384 switch ((AMDGPUISD::NodeType)Opcode) {
5385 case AMDGPUISD::FIRST_NUMBER: break;
5386 // AMDIL DAG nodes
5387 NODE_NAME_CASE(UMUL);
5388 NODE_NAME_CASE(BRANCH_COND);
5389
5390 // AMDGPU DAG nodes
5391 NODE_NAME_CASE(IF)
5392 NODE_NAME_CASE(ELSE)
5393 NODE_NAME_CASE(LOOP)
5394 NODE_NAME_CASE(CALL)
5395 NODE_NAME_CASE(TC_RETURN)
5396 NODE_NAME_CASE(TC_RETURN_GFX)
5397 NODE_NAME_CASE(TC_RETURN_CHAIN)
5398 NODE_NAME_CASE(TRAP)
5399 NODE_NAME_CASE(RET_GLUE)
5400 NODE_NAME_CASE(WAVE_ADDRESS)
5401 NODE_NAME_CASE(RETURN_TO_EPILOG)
5402 NODE_NAME_CASE(ENDPGM)
5403 NODE_NAME_CASE(ENDPGM_TRAP)
5404 NODE_NAME_CASE(SIMULATED_TRAP)
5405 NODE_NAME_CASE(DWORDADDR)
5406 NODE_NAME_CASE(FRACT)
5407 NODE_NAME_CASE(SETCC)
5408 NODE_NAME_CASE(SETREG)
5409 NODE_NAME_CASE(DENORM_MODE)
5410 NODE_NAME_CASE(FMA_W_CHAIN)
5411 NODE_NAME_CASE(FMUL_W_CHAIN)
5412 NODE_NAME_CASE(CLAMP)
5413 NODE_NAME_CASE(COS_HW)
5414 NODE_NAME_CASE(SIN_HW)
5415 NODE_NAME_CASE(FMAX_LEGACY)
5416 NODE_NAME_CASE(FMIN_LEGACY)
5417 NODE_NAME_CASE(FMAX3)
5418 NODE_NAME_CASE(SMAX3)
5419 NODE_NAME_CASE(UMAX3)
5420 NODE_NAME_CASE(FMIN3)
5421 NODE_NAME_CASE(SMIN3)
5422 NODE_NAME_CASE(UMIN3)
5423 NODE_NAME_CASE(FMED3)
5424 NODE_NAME_CASE(SMED3)
5425 NODE_NAME_CASE(UMED3)
5426 NODE_NAME_CASE(FMAXIMUM3)
5427 NODE_NAME_CASE(FMINIMUM3)
5428 NODE_NAME_CASE(FDOT2)
5429 NODE_NAME_CASE(URECIP)
5430 NODE_NAME_CASE(DIV_SCALE)
5431 NODE_NAME_CASE(DIV_FMAS)
5432 NODE_NAME_CASE(DIV_FIXUP)
5433 NODE_NAME_CASE(FMAD_FTZ)
5434 NODE_NAME_CASE(RCP)
5435 NODE_NAME_CASE(RSQ)
5436 NODE_NAME_CASE(RCP_LEGACY)
5437 NODE_NAME_CASE(RCP_IFLAG)
5438 NODE_NAME_CASE(LOG)
5439 NODE_NAME_CASE(EXP)
5440 NODE_NAME_CASE(FMUL_LEGACY)
5441 NODE_NAME_CASE(RSQ_CLAMP)
5442 NODE_NAME_CASE(FP_CLASS)
5443 NODE_NAME_CASE(DOT4)
5444 NODE_NAME_CASE(CARRY)
5445 NODE_NAME_CASE(BORROW)
5446 NODE_NAME_CASE(BFE_U32)
5447 NODE_NAME_CASE(BFE_I32)
5448 NODE_NAME_CASE(BFI)
5449 NODE_NAME_CASE(BFM)
5450 NODE_NAME_CASE(FFBH_U32)
5451 NODE_NAME_CASE(FFBH_I32)
5452 NODE_NAME_CASE(FFBL_B32)
5453 NODE_NAME_CASE(MUL_U24)
5454 NODE_NAME_CASE(MUL_I24)
5455 NODE_NAME_CASE(MULHI_U24)
5456 NODE_NAME_CASE(MULHI_I24)
5457 NODE_NAME_CASE(MAD_U24)
5458 NODE_NAME_CASE(MAD_I24)
5459 NODE_NAME_CASE(MAD_I64_I32)
5460 NODE_NAME_CASE(MAD_U64_U32)
5461 NODE_NAME_CASE(PERM)
5462 NODE_NAME_CASE(TEXTURE_FETCH)
5463 NODE_NAME_CASE(R600_EXPORT)
5464 NODE_NAME_CASE(CONST_ADDRESS)
5465 NODE_NAME_CASE(REGISTER_LOAD)
5466 NODE_NAME_CASE(REGISTER_STORE)
5467 NODE_NAME_CASE(SAMPLE)
5468 NODE_NAME_CASE(SAMPLEB)
5469 NODE_NAME_CASE(SAMPLED)
5470 NODE_NAME_CASE(SAMPLEL)
5471 NODE_NAME_CASE(CVT_F32_UBYTE0)
5472 NODE_NAME_CASE(CVT_F32_UBYTE1)
5473 NODE_NAME_CASE(CVT_F32_UBYTE2)
5474 NODE_NAME_CASE(CVT_F32_UBYTE3)
5475 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5476 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5477 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5478 NODE_NAME_CASE(CVT_PK_I16_I32)
5479 NODE_NAME_CASE(CVT_PK_U16_U32)
5480 NODE_NAME_CASE(FP_TO_FP16)
5481 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5482 NODE_NAME_CASE(CONST_DATA_PTR)
5483 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5484 NODE_NAME_CASE(LDS)
5485 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5486 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5487 NODE_NAME_CASE(DUMMY_CHAIN)
5488 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5489 NODE_NAME_CASE(LOAD_D16_HI)
5490 NODE_NAME_CASE(LOAD_D16_LO)
5491 NODE_NAME_CASE(LOAD_D16_HI_I8)
5492 NODE_NAME_CASE(LOAD_D16_HI_U8)
5493 NODE_NAME_CASE(LOAD_D16_LO_I8)
5494 NODE_NAME_CASE(LOAD_D16_LO_U8)
5495 NODE_NAME_CASE(STORE_MSKOR)
5496 NODE_NAME_CASE(LOAD_CONSTANT)
5497 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5498 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5499 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5500 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5501 NODE_NAME_CASE(DS_ORDERED_COUNT)
5502 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5503 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5504 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5505 NODE_NAME_CASE(BUFFER_LOAD)
5506 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5507 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5508 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5509 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5510 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5511 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5512 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5513 NODE_NAME_CASE(SBUFFER_LOAD)
5514 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5515 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5516 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5517 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5518 NODE_NAME_CASE(BUFFER_STORE)
5519 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5520 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5521 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5522 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5523 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5524 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5525 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5526 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5527 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5528 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5529 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5530 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5531 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5532 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5533 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5534 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5535 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5536 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5537 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5538 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5539 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5540 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5541 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5542
5543 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5544 }
5545 return nullptr;
5546 }
5547
5548 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5549 SelectionDAG &DAG, int Enabled,
5550 int &RefinementSteps,
5551 bool &UseOneConstNR,
5552 bool Reciprocal) const {
5553 EVT VT = Operand.getValueType();
5554
5555 if (VT == MVT::f32) {
5556 RefinementSteps = 0;
5557 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5558 }
5559
5560 // TODO: There is also an f64 rsq instruction, but the documentation is less
5561 // clear on its precision.
5562
5563 return SDValue();
5564 }
5565
5566 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5567 SelectionDAG &DAG, int Enabled,
5568 int &RefinementSteps) const {
5569 EVT VT = Operand.getValueType();
5570
5571 if (VT == MVT::f32) {
5572 // Reciprocal, < 1 ulp error.
5573 //
5574 // This reciprocal approximation converges to < 0.5 ulp error with one
5575 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
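// The classic step is x1 = x0 * (2 - d * x0) for an estimate x0 of
// 1/d; each iteration roughly doubles the number of correct bits.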
5576
5577 RefinementSteps = 0;
5578 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5579 }
5580
5581 // TODO: There is also an f64 rcp instruction, but the documentation is less
5582 // clear on its precision.
5583
5584 return SDValue();
5585 }
5586
5587static unsigned workitemIntrinsicDim(unsigned ID) {
5588 switch (ID) {
5589 case Intrinsic::amdgcn_workitem_id_x:
5590 return 0;
5591 case Intrinsic::amdgcn_workitem_id_y:
5592 return 1;
5593 case Intrinsic::amdgcn_workitem_id_z:
5594 return 2;
5595 default:
5596 llvm_unreachable("not a workitem intrinsic");
5597 }
5598 }
5599
5600 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5601 const SDValue Op, KnownBits &Known,
5602 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5603
5604 Known.resetAll(); // Don't know anything.
5605
5606 unsigned Opc = Op.getOpcode();
5607
5608 switch (Opc) {
5609 default:
5610 break;
5611 case AMDGPUISD::CARRY:
5612 case AMDGPUISD::BORROW: {
5613 Known.Zero = APInt::getHighBitsSet(32, 31);
5614 break;
5615 }
5616
5617 case AMDGPUISD::BFE_I32:
5618 case AMDGPUISD::BFE_U32: {
5619 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5620 if (!CWidth)
5621 return;
5622
5623 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5624
5625 if (Opc == AMDGPUISD::BFE_U32)
5626 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5627
5628 break;
5629 }
5630 case AMDGPUISD::FP_TO_FP16: {
5631 unsigned BitWidth = Known.getBitWidth();
5632
5633 // High bits are zero.
5634 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5635 break;
5636 }
5637 case AMDGPUISD::MUL_U24:
5638 case AMDGPUISD::MUL_I24: {
5639 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5640 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5641 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5642 RHSKnown.countMinTrailingZeros();
5643 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5644 // Skip extra check if all bits are known zeros.
5645 if (TrailZ >= 32)
5646 break;
5647
5648 // Truncate to 24 bits.
5649 LHSKnown = LHSKnown.trunc(24);
5650 RHSKnown = RHSKnown.trunc(24);
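// Only 24 bits of each operand feed the multiply, so the product has
// at most 48 significant bits; the cases below bound the known high
// bits from the operands' significant (signed) or active (unsigned)
// bit counts.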
5651
5652 if (Opc == AMDGPUISD::MUL_I24) {
5653 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5654 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5655 unsigned MaxValBits = LHSValBits + RHSValBits;
5656 if (MaxValBits > 32)
5657 break;
5658 unsigned SignBits = 32 - MaxValBits + 1;
5659 bool LHSNegative = LHSKnown.isNegative();
5660 bool LHSNonNegative = LHSKnown.isNonNegative();
5661 bool LHSPositive = LHSKnown.isStrictlyPositive();
5662 bool RHSNegative = RHSKnown.isNegative();
5663 bool RHSNonNegative = RHSKnown.isNonNegative();
5664 bool RHSPositive = RHSKnown.isStrictlyPositive();
5665
5666 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5667 Known.Zero.setHighBits(SignBits);
5668 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5669 Known.One.setHighBits(SignBits);
5670 } else {
5671 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5672 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5673 unsigned MaxValBits = LHSValBits + RHSValBits;
5674 if (MaxValBits >= 32)
5675 break;
5676 Known.Zero.setBitsFrom(MaxValBits);
5677 }
5678 break;
5679 }
5680 case AMDGPUISD::PERM: {
5681 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5682 if (!CMask)
5683 return;
5684
5685 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5686 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5687 unsigned Sel = CMask->getZExtValue();
5688
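// Selector semantics modeled below: each selector byte picks a byte,
// values 0-3 from the second operand and 4-6 from the first; 0x0c
// yields 0x00 and anything above 0x0c yields 0xff, while the remaining
// selector values are left unknown here.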
5689 for (unsigned I = 0; I < 32; I += 8) {
5690 unsigned SelBits = Sel & 0xff;
5691 if (SelBits < 4) {
5692 SelBits *= 8;
5693 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5694 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5695 } else if (SelBits < 7) {
5696 SelBits = (SelBits & 3) * 8;
5697 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5698 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5699 } else if (SelBits == 0x0c) {
5700 Known.Zero |= 0xFFull << I;
5701 } else if (SelBits > 0x0c) {
5702 Known.One |= 0xFFull << I;
5703 }
5704 Sel >>= 8;
5705 }
5706 break;
5707 }
5708 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5709 Known.Zero.setHighBits(24);
5710 break;
5711 }
5712 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5713 Known.Zero.setHighBits(16);
5714 break;
5715 }
5716 case AMDGPUISD::LDS: {
5717 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5718 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5719
5720 Known.Zero.setHighBits(16);
5721 Known.Zero.setLowBits(Log2(Alignment));
5722 break;
5723 }
5724 case AMDGPUISD::SMIN3:
5725 case AMDGPUISD::SMAX3:
5726 case AMDGPUISD::SMED3:
5727 case AMDGPUISD::UMIN3:
5728 case AMDGPUISD::UMAX3:
5729 case AMDGPUISD::UMED3: {
5730 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5731 if (Known2.isUnknown())
5732 break;
5733
5734 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5735 if (Known1.isUnknown())
5736 break;
5737
5738 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5739 if (Known0.isUnknown())
5740 break;
5741
5742 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5743 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5744 Known.One = Known0.One & Known1.One & Known2.One;
5745 break;
5746 }
5747 case ISD::INTRINSIC_WO_CHAIN: {
5748 unsigned IID = Op.getConstantOperandVal(0);
5749 switch (IID) {
5750 case Intrinsic::amdgcn_workitem_id_x:
5751 case Intrinsic::amdgcn_workitem_id_y:
5752 case Intrinsic::amdgcn_workitem_id_z: {
5753 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5754 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5755 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5756 break;
5757 }
5758 default:
5759 break;
5760 }
5761 }
5762 }
5763 }
5764
5765 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5766 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5767 unsigned Depth) const {
5768 switch (Op.getOpcode()) {
5769 case AMDGPUISD::BFE_I32: {
5770 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5771 if (!Width)
5772 return 1;
5773
5774 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5775 if (!isNullConstant(Op.getOperand(1)))
5776 return SignBits;
5777
5778 // TODO: Could probably figure something out with non-0 offsets.
5779 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5780 return std::max(SignBits, Op0SignBits);
5781 }
5782
5783 case AMDGPUISD::BFE_U32: {
5784 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5785 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5786 }
5787
5788 case AMDGPUISD::CARRY:
5789 case AMDGPUISD::BORROW:
5790 return 31;
5791 case AMDGPUISD::BUFFER_LOAD_BYTE:
5792 return 25;
5793 case AMDGPUISD::BUFFER_LOAD_SHORT:
5794 return 17;
5795 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5796 return 24;
5797 case AMDGPUISD::BUFFER_LOAD_USHORT:
5798 return 16;
5799 case AMDGPUISD::FP_TO_FP16:
5800 return 16;
5801 case AMDGPUISD::SMIN3:
5802 case AMDGPUISD::SMAX3:
5803 case AMDGPUISD::SMED3:
5804 case AMDGPUISD::UMIN3:
5805 case AMDGPUISD::UMAX3:
5806 case AMDGPUISD::UMED3: {
5807 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5808 if (Tmp2 == 1)
5809 return 1; // Early out.
5810
5811 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5812 if (Tmp1 == 1)
5813 return 1; // Early out.
5814
5815 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5816 if (Tmp0 == 1)
5817 return 1; // Early out.
5818
5819 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5820 }
5821 default:
5822 return 1;
5823 }
5824 }
5825
5826 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5827 GISelKnownBits &Analysis, Register R,
5828 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5829 unsigned Depth) const {
5830 const MachineInstr *MI = MRI.getVRegDef(R);
5831 if (!MI)
5832 return 1;
5833
5834 // TODO: Check range metadata on MMO.
5835 switch (MI->getOpcode()) {
5836 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5837 return 25;
5838 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5839 return 17;
5840 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5841 return 24;
5842 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5843 return 16;
5844 case AMDGPU::G_AMDGPU_SMED3:
5845 case AMDGPU::G_AMDGPU_UMED3: {
5846 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5847 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5848 if (Tmp2 == 1)
5849 return 1;
5850 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5851 if (Tmp1 == 1)
5852 return 1;
5853 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5854 if (Tmp0 == 1)
5855 return 1;
5856 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5857 }
5858 default:
5859 return 1;
5860 }
5861 }
5862
5863 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5864 const SelectionDAG &DAG,
5865 bool SNaN,
5866 unsigned Depth) const {
5867 unsigned Opcode = Op.getOpcode();
5868 switch (Opcode) {
5869 case AMDGPUISD::FMIN_LEGACY:
5870 case AMDGPUISD::FMAX_LEGACY: {
5871 if (SNaN)
5872 return true;
5873
5874 // TODO: Can check no nans on one of the operands for each one, but which
5875 // one?
5876 return false;
5877 }
5878 case AMDGPUISD::FMUL_LEGACY:
5879 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5880 if (SNaN)
5881 return true;
5882 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5883 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5884 }
5885 case AMDGPUISD::FMED3:
5886 case AMDGPUISD::FMIN3:
5887 case AMDGPUISD::FMAX3:
5888 case AMDGPUISD::FMINIMUM3:
5889 case AMDGPUISD::FMAXIMUM3:
5890 case AMDGPUISD::FMAD_FTZ: {
5891 if (SNaN)
5892 return true;
5893 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5894 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5895 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5896 }
5897 case AMDGPUISD::CVT_F32_UBYTE0:
5898 case AMDGPUISD::CVT_F32_UBYTE1:
5899 case AMDGPUISD::CVT_F32_UBYTE2:
5900 case AMDGPUISD::CVT_F32_UBYTE3:
5901 return true;
5902
5903 case AMDGPUISD::RCP:
5904 case AMDGPUISD::RSQ:
5905 case AMDGPUISD::RCP_LEGACY:
5906 case AMDGPUISD::RSQ_CLAMP: {
5907 if (SNaN)
5908 return true;
5909
5910 // TODO: Need an is-known-positive check.
5911 return false;
5912 }
5913 case ISD::FLDEXP:
5914 case AMDGPUISD::FRACT: {
5915 if (SNaN)
5916 return true;
5917 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5918 }
5919 case AMDGPUISD::DIV_SCALE:
5920 case AMDGPUISD::DIV_FMAS:
5921 case AMDGPUISD::DIV_FIXUP:
5922 // TODO: Refine on operands.
5923 return SNaN;
5924 case AMDGPUISD::SIN_HW:
5925 case AMDGPUISD::COS_HW: {
5926 // TODO: Need a check for infinity.
5927 return SNaN;
5928 }
5929 case ISD::INTRINSIC_WO_CHAIN: {
5930 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5931 // TODO: Handle more intrinsics
5932 switch (IntrinsicID) {
5933 case Intrinsic::amdgcn_cubeid:
5934 return true;
5935
5936 case Intrinsic::amdgcn_frexp_mant: {
5937 if (SNaN)
5938 return true;
5939 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5940 }
5941 case Intrinsic::amdgcn_cvt_pkrtz: {
5942 if (SNaN)
5943 return true;
5944 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5945 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5946 }
5947 case Intrinsic::amdgcn_rcp:
5948 case Intrinsic::amdgcn_rsq:
5949 case Intrinsic::amdgcn_rcp_legacy:
5950 case Intrinsic::amdgcn_rsq_legacy:
5951 case Intrinsic::amdgcn_rsq_clamp: {
5952 if (SNaN)
5953 return true;
5954
5955 // TODO: Need an is-known-positive check.
5956 return false;
5957 }
5958 case Intrinsic::amdgcn_trig_preop:
5959 case Intrinsic::amdgcn_fdot2:
5960 // TODO: Refine on operand
5961 return SNaN;
5962 case Intrinsic::amdgcn_fma_legacy:
5963 if (SNaN)
5964 return true;
5965 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5966 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5967 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5968 default:
5969 return false;
5970 }
5971 }
5972 default:
5973 return false;
5974 }
5975 }
5976
5977 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
5978 Register N0, Register N1) const {
5979 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
5980 }
5981
5982 TargetLowering::AtomicExpansionKind
5983 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
5984 switch (RMW->getOperation()) {
5985 case AtomicRMWInst::Nand:
5986 case AtomicRMWInst::FAdd:
5987 case AtomicRMWInst::FSub:
5988 case AtomicRMWInst::FMax:
5989 case AtomicRMWInst::FMin:
5990 return AtomicExpansionKind::CmpXChg;
5991 default: {
5992 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
5993 unsigned Size = IntTy->getBitWidth();
5994 if (Size == 32 || Size == 64)
5995 return AtomicExpansionKind::None;
5996 }
5997
5998 return AtomicExpansionKind::CmpXChg;
5999 }
6000 }
6001 }
6002
6003 /// Whether it is profitable to sink the operands of an
6004 /// Instruction I to the basic block of I.
6005 /// This helps using several modifiers (like abs and neg) more often.
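/// For example, sinking an fneg next to its user lets instruction
/// selection fold it into a source modifier instead of emitting a
/// separate instruction.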
6006 bool AMDGPUTargetLowering::shouldSinkOperands(
6007 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6008 using namespace PatternMatch;
6009
6010 for (auto &Op : I->operands()) {
6011 // Ensure we are not already sinking this operand.
6012 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
6013 continue;
6014
6015 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
6016 Ops.push_back(&Op);
6017 }
6018
6019 return !Ops.empty();
6020}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_READONLY
Definition: Compiler.h:227
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:266
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1260
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1042
const fltSemantics & getSemantics() const
Definition: APFloat.h:1303
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1060
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1026
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ FSub
*p = old - v
Definition: Instructions.h:788
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
BinOp getOperation() const
Definition: Instructions.h:845
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for unsupported feature in backend.
iterator_range< arg_iterator > args()
Definition: Function.h:842
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or truncating it.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering of inequal strings isn't needed.
Definition: StringRef.h:164
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predicted right.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes=true) const
Returns the type for the shift amount of a shift opcode.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/fp until it can find one that works.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom DAG combiner for by implementing the PerformDAGCombine virtual method.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate what to do about it.
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more preferable chain.
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned memory access.
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be used to store the results of comparisons for use by selects and conditional branches.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequences that increase the amount of flow control.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the DemandedBits/DemandedElts of the final scalar result.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tessellation and geometry are not in use, or otherwise copy shader if geometry is in use).
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is not commonly executed.
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition: ISDOpcodes.h:251
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:270
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store instruction, then an offset node that is added to the base ptr.
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length and element type, this produces a concatenated vector result value, with length equal to the sum of the lengths of the inputs.
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-precision (16 bit) floating numbers.
Definition: ISDOpcodes.h:914
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:501
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:508
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant, which is required to be operand #1) half of the integer or float value specified as operand #0.
Definition: ISDOpcodes.h:223
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
Definition: ISDOpcodes.h:652
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values, following IEEE-754 definitions except for signed zero behavior.
Definition: ISDOpcodes.h:985
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) based on the boolean result of comparing the lhs and rhs (ops #0 and #1) of a conditional expression with the condition code in operand #4.
Definition: ISDOpcodes.h:743
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register.
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector result matching the vector operands.
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1104
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:991
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:280
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:945
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:870
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1101
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero or sign extended from a narrower type.
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects.
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool getAlign(const Function &F, unsigned index, unsigned &align)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual underlying non-aggregate types that comprise it.
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1387
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:462
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight), and returns the integer EVT with that number of bits.
Definition: ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:141
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:110
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bits.
Definition: KnownBits.h:265
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.