1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47}
48
51}
52
54 // In order for this to be a signed 24-bit value, bit 23 must
55 // be a sign bit.
56 return DAG.ComputeMaxSignificantBits(Op);
57}
58
60 const AMDGPUSubtarget &STI)
61 : TargetLowering(TM), Subtarget(&STI) {
62 // Always lower memset, memcpy, and memmove intrinsics to load/store
63 // instructions, rather than generating calls to memset, memcpy or memmove.
67
68 // Lower floating point store/load to integer store/load to reduce the number
69 // of patterns in tablegen.
71 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
72
74 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
75
77 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
78
80 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
81
83 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
84
86 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
87
89 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
90
92 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
93
95 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
96
98 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
99
100 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
102
103 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
105
106 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
107 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
108
109 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
110 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
111
113 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
114
116 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
117
119 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
120
122 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
123
125 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
126
128 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
129
131 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
132
134 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
135
137 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
138
140 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
141
142 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
143 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
144
145 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
146 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
147
149 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
150
151 // TODO: Would be better to consume as directly legal
153 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
154
156 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
157
159 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
160
162 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
163
164 // There are no 64-bit extloads. These should be done as a 32-bit extload and
165 // an extension to 64-bit.
166 for (MVT VT : MVT::integer_valuetypes())
168 Expand);
169
170 for (MVT VT : MVT::integer_valuetypes()) {
171 if (VT == MVT::i64)
172 continue;
173
174 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
175 setLoadExtAction(Op, VT, MVT::i1, Promote);
176 setLoadExtAction(Op, VT, MVT::i8, Legal);
177 setLoadExtAction(Op, VT, MVT::i16, Legal);
178 setLoadExtAction(Op, VT, MVT::i32, Expand);
179 }
180 }
181
183 for (auto MemVT :
184 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
186 Expand);
187
188 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
189 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
190 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
191 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
192 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
193 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
195 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
196 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
197 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
198 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
199 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
200 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
201 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
202
203 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
204 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
205 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
209
210 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
222
224 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
225
227 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
228
230 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
231
233 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
234
236 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
237
239 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
240
242 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
243
245 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
246
248 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
249
251 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
252
254 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
255
257 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
258
260 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
261
263 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
264
266 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
267
269 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
270
272 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
273
275 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
276
278 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
279
281 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
282
284 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
285
287 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
288
290 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
291
293 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
294
296 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
297
299 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
300
302 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
303
304 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
305 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
306 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
307 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
308
309 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
310 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
311 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
312 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
313
314 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
315 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
316 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
317 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
318 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
319 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
320 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
321 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
322
323 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
324 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
325 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
326
327 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
328 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
329
330 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
331
332 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
333 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
334 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
335 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
336 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
337 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
338
339 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
340 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
341 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
342 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
343
344 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
345 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
346
347 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
348 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
349 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
351 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
353 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
354
355 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
356 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
357
359
360 // For R600, this is totally unsupported, just custom lower to produce an
361 // error.
363
364 // Library functions. These default to Expand, but we have instructions
365 // for them.
368 MVT::f32, Legal);
369
371 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
372
375 Custom);
376
377 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
378
379 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
380
381 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
382
383 if (Subtarget->has16BitInsts())
384 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
385 else {
386 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
388 }
389
391 Custom);
392
393 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
394 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
395 // default unless marked custom/legal.
398 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
399 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
400 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
401 Custom);
402
403 // Expand to fneg + fadd.
405
407 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
408 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
409 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
410 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
411 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
412 Custom);
413
414 // FIXME: Why is v8f16/v8bf16 missing?
417 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
418 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
419 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
420 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
421 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
422 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
423 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
424 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
425 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
426 MVT::v32i16, MVT::v32f16, MVT::v32bf16},
427 Custom);
428
430 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
431
432 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
433 for (MVT VT : ScalarIntVTs) {
434 // These should use [SU]DIVREM, so set them to expand
436 Expand);
437
438 // GPU does not have divrem function for signed or unsigned.
440
441 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
443
445
446 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
448 }
449
450 // The hardware supports 32-bit FSHR, but not FSHL.
452
453 // The hardware supports 32-bit ROTR, but not ROTL.
454 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
456
458
462 MVT::i64, Custom);
464
466 Legal);
467
470 MVT::i64, Custom);
471
472 for (auto VT : {MVT::i8, MVT::i16})
474
475 static const MVT::SimpleValueType VectorIntTypes[] = {
476 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
477 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
478
479 for (MVT VT : VectorIntTypes) {
480 // Expand the following operations for the current type by default.
492 ISD::SETCC},
493 VT, Expand);
494 }
495
496 static const MVT::SimpleValueType FloatVectorTypes[] = {
497 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
498 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
499
500 for (MVT VT : FloatVectorTypes) {
513 VT, Expand);
514 }
515
516 // This causes using an unrolled select operation rather than expansion with
517 // bit operations. This is in general better, but the alternative using BFI
518 // instructions may be better if the select sources are SGPRs.
520 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
521
523 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
524
526 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
527
529 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
530
532 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
533
535 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
536
538 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
539
541 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
542
544 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
545
547 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
548
549 // Disable most libcalls.
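// Only the atomic libcalls in the range [ATOMIC_LOAD, ATOMIC_FETCH_NAND_16]
// keep their names; every other libcall name is cleared so no runtime call is
// ever emitted for it.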
550 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
551 if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
552 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
553 }
554
556 setJumpIsExpensive(true);
557
558 // FIXME: This is only partially true. If we have to do vector compares, any
559 // SGPR pair can be a condition register. If we have a uniform condition, we
560 // are better off doing SALU operations, where there is only one SCC. For now,
561 // we don't have a way of knowing during instruction selection if a condition
562 // will be uniform and we always use vector compares. Assume we are using
563 // vector compares until that is fixed.
565
568
570
571 // We want to find all load dependencies for long chains of stores to enable
572 // merging into very wide vectors. The problem is with vectors with > 4
573 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
574 // vectors are a legal type, even though we have to split the loads
575 // usually. When we can more precisely specify load legality per address
576 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
577 // smarter so that they can figure out what to do in 2 iterations without all
578 // N > 4 stores on the same chain.
580
581 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
582 // about these during lowering.
583 MaxStoresPerMemcpy = 0xffffffff;
584 MaxStoresPerMemmove = 0xffffffff;
585 MaxStoresPerMemset = 0xffffffff;
586
587 // The expansion for 64-bit division is enormous.
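// Request a runtime bypass: 64-bit divides whose operands actually fit in 32
// bits are rewritten to take the much cheaper 32-bit path (controlled by the
// amdgpu-bypass-slow-div option above).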
589 addBypassSlowDiv(64, 32);
590
601
605}
606
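// Signed zeros may be ignored either globally (the NoSignedZerosFPMath target
// option) or per-node via the nsz fast-math flag.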
608 if (getTargetMachine().Options.NoSignedZerosFPMath)
609 return true;
610
611 const auto Flags = Op.getNode()->getFlags();
612 if (Flags.hasNoSignedZeros())
613 return true;
614
615 return false;
616}
617
618//===----------------------------------------------------------------------===//
619// Target Information
620//===----------------------------------------------------------------------===//
621
623static bool fnegFoldsIntoOpcode(unsigned Opc) {
624 switch (Opc) {
625 case ISD::FADD:
626 case ISD::FSUB:
627 case ISD::FMUL:
628 case ISD::FMA:
629 case ISD::FMAD:
630 case ISD::FMINNUM:
631 case ISD::FMAXNUM:
634 case ISD::FMINIMUM:
635 case ISD::FMAXIMUM:
636 case ISD::SELECT:
637 case ISD::FSIN:
638 case ISD::FTRUNC:
639 case ISD::FRINT:
640 case ISD::FNEARBYINT:
641 case ISD::FROUNDEVEN:
643 case AMDGPUISD::RCP:
650 case AMDGPUISD::FMED3:
651 // TODO: handle llvm.amdgcn.fma.legacy
652 return true;
653 case ISD::BITCAST:
654 llvm_unreachable("bitcast is special cased");
655 default:
656 return false;
657 }
658}
659
660static bool fnegFoldsIntoOp(const SDNode *N) {
661 unsigned Opc = N->getOpcode();
662 if (Opc == ISD::BITCAST) {
663 // TODO: Is there a benefit to checking the conditions performFNegCombine
664 // does? We don't for the other cases.
665 SDValue BCSrc = N->getOperand(0);
666 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
667 return BCSrc.getNumOperands() == 2 &&
668 BCSrc.getOperand(1).getValueSizeInBits() == 32;
669 }
670
671 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
672 }
673
674 return fnegFoldsIntoOpcode(Opc);
675}
676
677/// \returns true if the operation will definitely need to use a 64-bit
678/// encoding, and thus will use a VOP3 encoding regardless of the source
679/// modifiers.
681static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
682 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
683 VT == MVT::f64;
684}
685
686/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers when
687/// it is used to implement ISD::SELECT for this type.
689static bool selectSupportsSourceMods(const SDNode *N) {
690 // TODO: Only applies if select will be vector
691 return N->getValueType(0) == MVT::f32;
692}
693
694// Most FP instructions support source modifiers, but this could be refined
695// slightly.
697static bool hasSourceMods(const SDNode *N) {
698 if (isa<MemSDNode>(N))
699 return false;
700
701 switch (N->getOpcode()) {
702 case ISD::CopyToReg:
703 case ISD::FDIV:
704 case ISD::FREM:
705 case ISD::INLINEASM:
709
710 // TODO: Should really be looking at the users of the bitcast. These are
711 // problematic because bitcasts are used to legalize all stores to integer
712 // types.
713 case ISD::BITCAST:
714 return false;
716 switch (N->getConstantOperandVal(0)) {
717 case Intrinsic::amdgcn_interp_p1:
718 case Intrinsic::amdgcn_interp_p2:
719 case Intrinsic::amdgcn_interp_mov:
720 case Intrinsic::amdgcn_interp_p1_f16:
721 case Intrinsic::amdgcn_interp_p2_f16:
722 return false;
723 default:
724 return true;
725 }
726 }
727 case ISD::SELECT:
729 default:
730 return true;
731 }
732}
733
735 unsigned CostThreshold) {
736 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
737 // it is truly free to use a source modifier in all cases. If there are
738 // multiple users, each of which would necessitate using VOP3, there will be
739 // a code size increase. Try to avoid increasing code size unless we know it
740 // will save on the instruction count.
741 unsigned NumMayIncreaseSize = 0;
742 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
743
744 assert(!N->use_empty());
745
746 // XXX - Should this limit number of uses to check?
747 for (const SDNode *U : N->uses()) {
748 if (!hasSourceMods(U))
749 return false;
750
751 if (!opMustUseVOP3Encoding(U, VT)) {
752 if (++NumMayIncreaseSize > CostThreshold)
753 return false;
754 }
755 }
756
757 return true;
758}
759
761 ISD::NodeType ExtendKind) const {
762 assert(!VT.isVector() && "only scalar expected");
763
764 // Round to the next multiple of 32-bits.
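// For example, an i40 return value is widened to i64 (two 32-bit pieces).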
765 unsigned Size = VT.getSizeInBits();
766 if (Size <= 32)
767 return MVT::i32;
768 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
769}
770
772 return MVT::i32;
773}
774
776 return true;
777}
778
779// The backend supports 32 and 64 bit floating point immediates.
780// FIXME: Why are we reporting vectors of FP immediates as legal?
782 bool ForCodeSize) const {
783 EVT ScalarVT = VT.getScalarType();
784 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
785 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
786}
787
788// We don't want to shrink f64 / f32 constants.
790 EVT ScalarVT = VT.getScalarType();
791 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
792}
793
795 ISD::LoadExtType ExtTy,
796 EVT NewVT) const {
797 // TODO: This may be worth removing. Check regression tests for diffs.
799 return false;
800
801 unsigned NewSize = NewVT.getStoreSizeInBits();
802
803 // If we are reducing to a 32-bit load or a smaller multi-dword load,
804 // this is always better.
805 if (NewSize >= 32)
806 return true;
807
808 EVT OldVT = N->getValueType(0);
809 unsigned OldSize = OldVT.getStoreSizeInBits();
810
811 MemSDNode *MN = cast<MemSDNode>(N);
812 unsigned AS = MN->getAddressSpace();
813 // Do not shrink an aligned scalar load to sub-dword.
814 // Scalar engine cannot do sub-dword loads.
815 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
816 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
819 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
820 MN->isInvariant())) &&
822 return false;
823
824 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
825 // extloads, so doing one requires using a buffer_load. In cases where we
826 // still couldn't use a scalar load, using the wider load shouldn't really
827 // hurt anything.
828
829 // If the old size already had to be an extload, there's no harm in continuing
830 // to reduce the width.
831 return (OldSize < 32);
832}
833
835 const SelectionDAG &DAG,
836 const MachineMemOperand &MMO) const {
837
838 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
839
840 if (LoadTy.getScalarType() == MVT::i32)
841 return false;
842
843 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
844 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
845
846 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
847 return false;
848
849 unsigned Fast = 0;
851 CastTy, MMO, &Fast) &&
852 Fast;
853}
854
855// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
856// profitable with the expansion for 64-bit since it's generally good to
857// speculate things.
859 return true;
860}
861
863 return true;
864}
865
867 switch (N->getOpcode()) {
868 case ISD::EntryToken:
869 case ISD::TokenFactor:
870 return true;
872 unsigned IntrID = N->getConstantOperandVal(0);
874 }
875 case ISD::LOAD:
876 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
878 return true;
879 return false;
880 case AMDGPUISD::SETCC: // ballot-style instruction
881 return true;
882 }
883 return false;
884}
885
887 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
888 NegatibleCost &Cost, unsigned Depth) const {
889
890 switch (Op.getOpcode()) {
891 case ISD::FMA:
892 case ISD::FMAD: {
893 // Negating a fma is not free if it has users without source mods.
894 if (!allUsesHaveSourceMods(Op.getNode()))
895 return SDValue();
896 break;
897 }
898 case AMDGPUISD::RCP: {
899 SDValue Src = Op.getOperand(0);
900 EVT VT = Op.getValueType();
901 SDLoc SL(Op);
902
903 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
904 ForCodeSize, Cost, Depth + 1);
905 if (NegSrc)
906 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
907 return SDValue();
908 }
909 default:
910 break;
911 }
912
913 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
914 ForCodeSize, Cost, Depth);
915}
916
917//===---------------------------------------------------------------------===//
918// Target Properties
919//===---------------------------------------------------------------------===//
920
923
924 // Packed operations do not have a fabs modifier.
925 return VT == MVT::f32 || VT == MVT::f64 ||
926 (Subtarget->has16BitInsts() && VT == MVT::f16);
927}
928
931 // Report this based on the end legalized type.
932 VT = VT.getScalarType();
933 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
934}
935
937 unsigned NumElem,
938 unsigned AS) const {
939 return true;
940}
941
943 // There are few operations which truly have vector input operands. Any vector
944 // operation is going to involve operations on each component, and a
945 // build_vector will be a copy per element, so it always makes sense to use a
946 // build_vector input in place of the extracted element to avoid a copy into a
947 // super register.
948 //
949 // We should probably only do this if all users are extracts only, but this
950 // should be the common case.
951 return true;
952}
953
955 // Truncate is just accessing a subregister.
956
957 unsigned SrcSize = Source.getSizeInBits();
958 unsigned DestSize = Dest.getSizeInBits();
959
960 return DestSize < SrcSize && DestSize % 32 == 0;
961}
962
964 // Truncate is just accessing a subregister.
965
966 unsigned SrcSize = Source->getScalarSizeInBits();
967 unsigned DestSize = Dest->getScalarSizeInBits();
968
969 if (DestSize == 16 && Subtarget->has16BitInsts())
970 return SrcSize >= 32;
971
972 return DestSize < SrcSize && DestSize % 32 == 0;
973}
974
976 unsigned SrcSize = Src->getScalarSizeInBits();
977 unsigned DestSize = Dest->getScalarSizeInBits();
978
979 if (SrcSize == 16 && Subtarget->has16BitInsts())
980 return DestSize >= 32;
981
982 return SrcSize == 32 && DestSize == 64;
983}
984
986 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
987 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
988 // this will enable reducing 64-bit operations to 32-bit, which is always
989 // good.
990
991 if (Src == MVT::i16)
992 return Dest == MVT::i32 || Dest == MVT::i64;
993
994 return Src == MVT::i32 && Dest == MVT::i64;
995}
996
998 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
999 // limited number of native 64-bit operations. Shrinking an operation to fit
1000 // in a single 32-bit register should always be helpful. As currently used,
1001 // this is much less general than the name suggests, and is only used in
1002 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1003 // not profitable, and may actually be harmful.
1004 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1005}
1006
1008 const SDNode* N, CombineLevel Level) const {
1009 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1010 N->getOpcode() == ISD::SRL) &&
1011 "Expected shift op");
1012 // Always commute pre-type legalization and right shifts.
1013 // We're looking for shl(or(x,y),z) patterns.
1015 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1016 return true;
1017
1018 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1019 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1020 (N->use_begin()->getOpcode() == ISD::SRA ||
1021 N->use_begin()->getOpcode() == ISD::SRL))
1022 return false;
1023
1024 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1025 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1026 if (LHS.getOpcode() != ISD::SHL)
1027 return false;
1028 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1029 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1030 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1031 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1032 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1033 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1034 };
1035 SDValue LHS = N->getOperand(0).getOperand(0);
1036 SDValue RHS = N->getOperand(0).getOperand(1);
1037 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1038}
1039
1040//===---------------------------------------------------------------------===//
1041// TargetLowering Callbacks
1042//===---------------------------------------------------------------------===//
1043
1045 bool IsVarArg) {
1046 switch (CC) {
1054 return CC_AMDGPU;
1057 return CC_AMDGPU_CS_CHAIN;
1058 case CallingConv::C:
1059 case CallingConv::Fast:
1060 case CallingConv::Cold:
1061 return CC_AMDGPU_Func;
1063 return CC_SI_Gfx;
1066 default:
1067 report_fatal_error("Unsupported calling convention for call");
1068 }
1069}
1070
1072 bool IsVarArg) {
1073 switch (CC) {
1076 llvm_unreachable("kernels should not be handled here");
1086 return RetCC_SI_Shader;
1088 return RetCC_SI_Gfx;
1089 case CallingConv::C:
1090 case CallingConv::Fast:
1091 case CallingConv::Cold:
1092 return RetCC_AMDGPU_Func;
1093 default:
1094 report_fatal_error("Unsupported calling convention.");
1095 }
1096}
1097
1098/// The SelectionDAGBuilder will automatically promote function arguments
1099/// with illegal types. However, this does not work for the AMDGPU targets
1100/// since the function arguments are stored in memory as these illegal types.
1101/// In order to handle this properly we need to get the original types sizes
1102/// from the LLVM IR Function and fix up the ISD::InputArg values before
1103/// passing them to AnalyzeFormalArguments()
1104
1105/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1106/// input values across multiple registers. Each item in the Ins array
1107/// represents a single value that will be stored in registers. Ins[x].VT is
1108/// the value type of the value that will be stored in the register, so
1109/// whatever SDNode we lower the argument to needs to be this type.
1110///
1111/// In order to correctly lower the arguments we need to know the size of each
1112/// argument. Since Ins[x].VT gives us the size of the register that will
1113/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1114/// for the original function argument so that we can deduce the correct memory
1115/// type to use for Ins[x]. In most cases the correct memory type will be
1116/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1117/// we have a kernel argument of type v8i8, this argument will be split into
1118/// 8 parts and each part will be represented by its own item in the Ins array.
1119/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1120/// the argument before it was split. From this, we deduce that the memory type
1121/// for each individual part is i8. We pass the memory type as LocVT to the
1122/// calling convention analysis function and the register type (Ins[x].VT) as
1123/// the ValVT.
1125 CCState &State,
1126 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1127 const MachineFunction &MF = State.getMachineFunction();
1128 const Function &Fn = MF.getFunction();
1129 LLVMContext &Ctx = Fn.getParent()->getContext();
1130 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1131 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1133
1134 Align MaxAlign = Align(1);
1135 uint64_t ExplicitArgOffset = 0;
1136 const DataLayout &DL = Fn.getParent()->getDataLayout();
1137
1138 unsigned InIndex = 0;
1139
1140 for (const Argument &Arg : Fn.args()) {
1141 const bool IsByRef = Arg.hasByRefAttr();
1142 Type *BaseArgTy = Arg.getType();
1143 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1144 Align Alignment = DL.getValueOrABITypeAlignment(
1145 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1146 MaxAlign = std::max(Alignment, MaxAlign);
1147 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1148
1149 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1150 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1151
1152 // We're basically throwing away everything passed into us and starting over
1153 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1154 // to us as computed in Ins.
1155 //
1156 // We also need to figure out what type legalization is trying to do to get
1157 // the correct memory offsets.
1158
1159 SmallVector<EVT, 16> ValueVTs;
1161 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1162
1163 for (unsigned Value = 0, NumValues = ValueVTs.size();
1164 Value != NumValues; ++Value) {
1165 uint64_t BasePartOffset = Offsets[Value];
1166
1167 EVT ArgVT = ValueVTs[Value];
1168 EVT MemVT = ArgVT;
1169 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1170 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1171
1172 if (NumRegs == 1) {
1173 // This argument is not split, so the IR type is the memory type.
1174 if (ArgVT.isExtended()) {
1175 // We have an extended type, like i24, so we should just use the
1176 // register type.
1177 MemVT = RegisterVT;
1178 } else {
1179 MemVT = ArgVT;
1180 }
1181 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1182 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1183 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1184 // We have a vector value which has been split into a vector with
1185 // the same scalar type, but fewer elements. This should handle
1186 // all the floating-point vector types.
1187 MemVT = RegisterVT;
1188 } else if (ArgVT.isVector() &&
1189 ArgVT.getVectorNumElements() == NumRegs) {
1190 // This arg has been split so that each element is stored in a separate
1191 // register.
1192 MemVT = ArgVT.getScalarType();
1193 } else if (ArgVT.isExtended()) {
1194 // We have an extended type, like i65.
1195 MemVT = RegisterVT;
1196 } else {
1197 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1198 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1199 if (RegisterVT.isInteger()) {
1200 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1201 } else if (RegisterVT.isVector()) {
1202 assert(!RegisterVT.getScalarType().isFloatingPoint());
1203 unsigned NumElements = RegisterVT.getVectorNumElements();
1204 assert(MemoryBits % NumElements == 0);
1205 // This vector type has been split into another vector type with
1206 // a different element size.
1207 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1208 MemoryBits / NumElements);
1209 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1210 } else {
1211 llvm_unreachable("cannot deduce memory type.");
1212 }
1213 }
1214
1215 // Convert one element vectors to scalar.
1216 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1217 MemVT = MemVT.getScalarType();
1218
1219 // Round up vec3/vec5 argument.
1220 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1221 assert(MemVT.getVectorNumElements() == 3 ||
1222 MemVT.getVectorNumElements() == 5 ||
1223 (MemVT.getVectorNumElements() >= 9 &&
1224 MemVT.getVectorNumElements() <= 12));
1225 MemVT = MemVT.getPow2VectorType(State.getContext());
1226 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1227 MemVT = MemVT.getRoundIntegerType(State.getContext());
1228 }
1229
1230 unsigned PartOffset = 0;
1231 for (unsigned i = 0; i != NumRegs; ++i) {
1232 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1233 BasePartOffset + PartOffset,
1234 MemVT.getSimpleVT(),
1236 PartOffset += MemVT.getStoreSize();
1237 }
1238 }
1239 }
1240}
1241
1243 SDValue Chain, CallingConv::ID CallConv,
1244 bool isVarArg,
1246 const SmallVectorImpl<SDValue> &OutVals,
1247 const SDLoc &DL, SelectionDAG &DAG) const {
1248 // FIXME: Fails for r600 tests
1249 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1250 // "wave terminate should not have return values");
1251 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1252}
1253
1254//===---------------------------------------------------------------------===//
1255// Target specific lowering
1256//===---------------------------------------------------------------------===//
1257
1258/// Selects the correct CCAssignFn for a given CallingConvention value.
1260 bool IsVarArg) {
1261 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1262}
1263
1265 bool IsVarArg) {
1267}
1268
1270 SelectionDAG &DAG,
1271 MachineFrameInfo &MFI,
1272 int ClobberedFI) const {
1273 SmallVector<SDValue, 8> ArgChains;
1274 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1275 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1276
1277 // Include the original chain at the beginning of the list. When this is
1278 // used by target LowerCall hooks, this helps legalize find the
1279 // CALLSEQ_BEGIN node.
1280 ArgChains.push_back(Chain);
1281
1282 // Add a chain value for each stack argument load overlapping the clobbered object.
1283 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1284 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1285 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1286 if (FI->getIndex() < 0) {
1287 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1288 int64_t InLastByte = InFirstByte;
1289 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1290
1291 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1292 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1293 ArgChains.push_back(SDValue(L, 1));
1294 }
1295 }
1296 }
1297 }
1298
1299 // Build a tokenfactor for all the chains.
1300 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1301}
1302
1305 StringRef Reason) const {
1306 SDValue Callee = CLI.Callee;
1307 SelectionDAG &DAG = CLI.DAG;
1308
1309 const Function &Fn = DAG.getMachineFunction().getFunction();
1310
1311 StringRef FuncName("<unknown>");
1312
1313 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1314 FuncName = G->getSymbol();
1315 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1316 FuncName = G->getGlobal()->getName();
1317
1319 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1320 DAG.getContext()->diagnose(NoCalls);
1321
1322 if (!CLI.IsTailCall) {
1323 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1324 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1325 }
1326
1327 return DAG.getEntryNode();
1328}
1329
1331 SmallVectorImpl<SDValue> &InVals) const {
1332 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1333}
1334
1336 SelectionDAG &DAG) const {
1337 const Function &Fn = DAG.getMachineFunction().getFunction();
1338
1339 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1340 SDLoc(Op).getDebugLoc());
1341 DAG.getContext()->diagnose(NoDynamicAlloca);
1342 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1343 return DAG.getMergeValues(Ops, SDLoc());
1344}
1345
1347 SelectionDAG &DAG) const {
1348 switch (Op.getOpcode()) {
1349 default:
1350 Op->print(errs(), &DAG);
1351 llvm_unreachable("Custom lowering code for this "
1352 "instruction is not implemented yet!");
1353 break;
1355 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1357 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1358 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1359 case ISD::FREM: return LowerFREM(Op, DAG);
1360 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1361 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1362 case ISD::FRINT: return LowerFRINT(Op, DAG);
1363 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1364 case ISD::FROUNDEVEN:
1365 return LowerFROUNDEVEN(Op, DAG);
1366 case ISD::FROUND: return LowerFROUND(Op, DAG);
1367 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1368 case ISD::FLOG2:
1369 return LowerFLOG2(Op, DAG);
1370 case ISD::FLOG:
1371 case ISD::FLOG10:
1372 return LowerFLOGCommon(Op, DAG);
1373 case ISD::FEXP:
1374 case ISD::FEXP10:
1375 return lowerFEXP(Op, DAG);
1376 case ISD::FEXP2:
1377 return lowerFEXP2(Op, DAG);
1378 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1379 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1380 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1381 case ISD::FP_TO_SINT:
1382 case ISD::FP_TO_UINT:
1383 return LowerFP_TO_INT(Op, DAG);
1384 case ISD::CTTZ:
1386 case ISD::CTLZ:
1388 return LowerCTLZ_CTTZ(Op, DAG);
1390 }
1391 return Op;
1392}
1393
1396 SelectionDAG &DAG) const {
1397 switch (N->getOpcode()) {
1399 // Different parts of legalization seem to interpret which type of
1400 // sign_extend_inreg is the one to check for custom lowering. The extended
1401 // from type is what really matters, but some places check for custom
1402 // lowering of the result type. This results in trying to use
1403 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1404 // nothing here and let the illegal result integer be handled normally.
1405 return;
1406 case ISD::FLOG2:
1407 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1408 Results.push_back(Lowered);
1409 return;
1410 case ISD::FLOG:
1411 case ISD::FLOG10:
1412 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1413 Results.push_back(Lowered);
1414 return;
1415 case ISD::FEXP2:
1416 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1417 Results.push_back(Lowered);
1418 return;
1419 case ISD::FEXP:
1420 case ISD::FEXP10:
1421 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1422 Results.push_back(Lowered);
1423 return;
1424 case ISD::CTLZ:
1426 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1427 Results.push_back(Lowered);
1428 return;
1429 default:
1430 return;
1431 }
1432}
1433
1435 SDValue Op,
1436 SelectionDAG &DAG) const {
1437
1438 const DataLayout &DL = DAG.getDataLayout();
1439 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1440 const GlobalValue *GV = G->getGlobal();
1441
1442 if (!MFI->isModuleEntryFunction()) {
1443 if (std::optional<uint32_t> Address =
1445 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1446 }
1447 }
1448
1449 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1450 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1451 if (!MFI->isModuleEntryFunction() &&
1452 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1453 SDLoc DL(Op);
1454 const Function &Fn = DAG.getMachineFunction().getFunction();
1455 DiagnosticInfoUnsupported BadLDSDecl(
1456 Fn, "local memory global used by non-kernel function",
1457 DL.getDebugLoc(), DS_Warning);
1458 DAG.getContext()->diagnose(BadLDSDecl);
1459
1460 // We currently don't have a way to correctly allocate LDS objects that
1461 // aren't directly associated with a kernel. We do force inlining of
1462 // functions that use local objects. However, if these dead functions are
1463 // not eliminated, we don't want a compile time error. Just emit a warning
1464 // and a trap, since there should be no callable path here.
1465 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1466 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1467 Trap, DAG.getRoot());
1468 DAG.setRoot(OutputChain);
1469 return DAG.getUNDEF(Op.getValueType());
1470 }
1471
1472 // XXX: What does the value of G->getOffset() mean?
1473 assert(G->getOffset() == 0 &&
1474 "Do not know what to do with an non-zero offset");
1475
1476 // TODO: We could emit code to handle the initialization somewhere.
1477 // We ignore the initializer for now and legalize it to allow selection.
1478 // The initializer will anyway get errored out during assembly emission.
1479 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1480 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1481 }
1482 return SDValue();
1483}
1484
1486 SelectionDAG &DAG) const {
1488 SDLoc SL(Op);
1489
1490 EVT VT = Op.getValueType();
1491 if (VT.getVectorElementType().getSizeInBits() < 32) {
1492 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1493 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1494 unsigned NewNumElt = OpBitSize / 32;
1495 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1497 MVT::i32, NewNumElt);
1498 for (const SDUse &U : Op->ops()) {
1499 SDValue In = U.get();
1500 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1501 if (NewNumElt > 1)
1502 DAG.ExtractVectorElements(NewIn, Args);
1503 else
1504 Args.push_back(NewIn);
1505 }
1506
1507 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1508 NewNumElt * Op.getNumOperands());
1509 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1510 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1511 }
1512 }
1513
1514 for (const SDUse &U : Op->ops())
1515 DAG.ExtractVectorElements(U.get(), Args);
1516
1517 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1518}
1519
1521 SelectionDAG &DAG) const {
1522 SDLoc SL(Op);
1524 unsigned Start = Op.getConstantOperandVal(1);
1525 EVT VT = Op.getValueType();
1526 EVT SrcVT = Op.getOperand(0).getValueType();
1527
1528 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1529 unsigned NumElt = VT.getVectorNumElements();
1530 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1531 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1532
1533 // Extract 32-bit registers at a time.
1534 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1535 EVT NewVT = NumElt == 2
1536 ? MVT::i32
1537 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1538 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1539
1540 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1541 if (NumElt == 2)
1542 Tmp = Args[0];
1543 else
1544 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1545
1546 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1547 }
1548
1549 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1551
1552 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1553}
1554
1555// TODO: Handle fabs too
1557 if (Val.getOpcode() == ISD::FNEG)
1558 return Val.getOperand(0);
1559
1560 return Val;
1561}
1562
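// Look through fneg, fabs and fcopysign to reach the underlying source value.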
1564 if (Val.getOpcode() == ISD::FNEG)
1565 Val = Val.getOperand(0);
1566 if (Val.getOpcode() == ISD::FABS)
1567 Val = Val.getOperand(0);
1568 if (Val.getOpcode() == ISD::FCOPYSIGN)
1569 Val = Val.getOperand(0);
1570 return Val;
1571}
1572
1574 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1575 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1576 SelectionDAG &DAG = DCI.DAG;
1577 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1578 switch (CCOpcode) {
1579 case ISD::SETOEQ:
1580 case ISD::SETONE:
1581 case ISD::SETUNE:
1582 case ISD::SETNE:
1583 case ISD::SETUEQ:
1584 case ISD::SETEQ:
1585 case ISD::SETFALSE:
1586 case ISD::SETFALSE2:
1587 case ISD::SETTRUE:
1588 case ISD::SETTRUE2:
1589 case ISD::SETUO:
1590 case ISD::SETO:
1591 break;
1592 case ISD::SETULE:
1593 case ISD::SETULT: {
1594 if (LHS == True)
1595 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1596 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1597 }
1598 case ISD::SETOLE:
1599 case ISD::SETOLT:
1600 case ISD::SETLE:
1601 case ISD::SETLT: {
1602 // Ordered. Assume ordered for undefined.
1603
1604 // Only do this after legalization to avoid interfering with other combines
1605 // which might occur.
1607 !DCI.isCalledByLegalizer())
1608 return SDValue();
1609
1610 // We need to permute the operands to get the correct NaN behavior. The
1611 // selected operand is the second one based on the failing compare with NaN,
1612 // so permute it based on the compare type the hardware uses.
1613 if (LHS == True)
1614 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1615 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1616 }
1617 case ISD::SETUGE:
1618 case ISD::SETUGT: {
1619 if (LHS == True)
1620 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1621 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1622 }
1623 case ISD::SETGT:
1624 case ISD::SETGE:
1625 case ISD::SETOGE:
1626 case ISD::SETOGT: {
1628 !DCI.isCalledByLegalizer())
1629 return SDValue();
1630
1631 if (LHS == True)
1632 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1633 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1634 }
1635 case ISD::SETCC_INVALID:
1636 llvm_unreachable("Invalid setcc condcode!");
1637 }
1638 return SDValue();
1639}
1640
1641/// Generate Min/Max node
1643 SDValue LHS, SDValue RHS,
1644 SDValue True, SDValue False,
1645 SDValue CC,
1646 DAGCombinerInfo &DCI) const {
1647 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1648 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1649
1650 SelectionDAG &DAG = DCI.DAG;
1651
1652 // If we can't directly match this, try to see if we can fold an fneg to
1653 // match.
1654
1655 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1656 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1657 SDValue NegTrue = peekFNeg(True);
1658
1659 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1660 // fmin/fmax.
1661 //
1662 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1663 // -> fneg (fmin_legacy lhs, K)
1664 //
1665 // TODO: Use getNegatedExpression
1666 if (LHS == NegTrue && CFalse && CRHS) {
1667 APFloat NegRHS = neg(CRHS->getValueAPF());
1668 if (NegRHS == CFalse->getValueAPF()) {
1669 SDValue Combined =
1670 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1671 if (Combined)
1672 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1673 return SDValue();
1674 }
1675 }
1676
1677 return SDValue();
1678}
1679
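// Bitcast a 64-bit value to v2i32 and return the low and high 32-bit halves.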
1680std::pair<SDValue, SDValue>
1682 SDLoc SL(Op);
1683
1684 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1685
1686 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1687 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1688
1689 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1690 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1691
1692 return std::pair(Lo, Hi);
1693}
1694
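// Extract only the low 32 bits of a 64-bit value via a v2i32 bitcast.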
1696 SDLoc SL(Op);
1697
1698 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1699 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1700 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1701}
1702
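// Extract only the high 32 bits of a 64-bit value via a v2i32 bitcast.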
1704 SDLoc SL(Op);
1705
1706 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1707 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1708 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1709}
1710
1711// Split a vector type into two parts. The first part is a power of two vector.
1712// The second part is whatever is left over, and is a scalar if it would
1713// otherwise be a 1-vector.
1714std::pair<EVT, EVT>
1716 EVT LoVT, HiVT;
1717 EVT EltVT = VT.getVectorElementType();
1718 unsigned NumElts = VT.getVectorNumElements();
1719 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1720 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1721 HiVT = NumElts - LoNumElts == 1
1722 ? EltVT
1723 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1724 return std::pair(LoVT, HiVT);
1725}
1726
1727// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1728// scalar.
1729std::pair<SDValue, SDValue>
1731 const EVT &LoVT, const EVT &HiVT,
1732 SelectionDAG &DAG) const {
1734 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1735 N.getValueType().getVectorNumElements() &&
1736 "More vector elements requested than available!");
1738 DAG.getVectorIdxConstant(0, DL));
1739 SDValue Hi = DAG.getNode(
1741 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1742 return std::pair(Lo, Hi);
1743}
1744
1746 SelectionDAG &DAG) const {
1747 LoadSDNode *Load = cast<LoadSDNode>(Op);
1748 EVT VT = Op.getValueType();
1749 SDLoc SL(Op);
1750
1751
1752 // If this is a 2 element vector, we really want to scalarize and not create
1753 // weird 1 element vectors.
1754 if (VT.getVectorNumElements() == 2) {
1755 SDValue Ops[2];
1756 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1757 return DAG.getMergeValues(Ops, SL);
1758 }
1759
1760 SDValue BasePtr = Load->getBasePtr();
1761 EVT MemVT = Load->getMemoryVT();
1762
1763 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1764
1765 EVT LoVT, HiVT;
1766 EVT LoMemVT, HiMemVT;
1767 SDValue Lo, Hi;
1768
1769 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1770 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1771 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1772
1773 unsigned Size = LoMemVT.getStoreSize();
1774 Align BaseAlign = Load->getAlign();
1775 Align HiAlign = commonAlignment(BaseAlign, Size);
1776
1777 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1778 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1779 BaseAlign, Load->getMemOperand()->getFlags());
1780 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1781 SDValue HiLoad =
1782 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1783 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1784 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1785
1786 SDValue Join;
1787 if (LoVT == HiVT) {
1788 // This is the case that the vector is power of two so was evenly split.
1789 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1790 } else {
1791 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1792 DAG.getVectorIdxConstant(0, SL));
1793 Join = DAG.getNode(
1795 VT, Join, HiLoad,
1797 }
1798
1799 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1800 LoLoad.getValue(1), HiLoad.getValue(1))};
1801
1802 return DAG.getMergeValues(Ops, SL);
1803}
1804
1806 SelectionDAG &DAG) const {
1807 LoadSDNode *Load = cast<LoadSDNode>(Op);
1808 EVT VT = Op.getValueType();
1809 SDValue BasePtr = Load->getBasePtr();
1810 EVT MemVT = Load->getMemoryVT();
1811 SDLoc SL(Op);
1812 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1813 Align BaseAlign = Load->getAlign();
1814 unsigned NumElements = MemVT.getVectorNumElements();
1815
1816 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1817 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1818 if (NumElements != 3 ||
1819 (BaseAlign < Align(8) &&
1820 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1821 return SplitVectorLoad(Op, DAG);
1822
1823 assert(NumElements == 3);
1824
1825 EVT WideVT =
1827 EVT WideMemVT =
1829 SDValue WideLoad = DAG.getExtLoad(
1830 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1831 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1832 return DAG.getMergeValues(
1833 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1834 DAG.getVectorIdxConstant(0, SL)),
1835 WideLoad.getValue(1)},
1836 SL);
1837}
1838
1840 SelectionDAG &DAG) const {
1841 StoreSDNode *Store = cast<StoreSDNode>(Op);
1842 SDValue Val = Store->getValue();
1843 EVT VT = Val.getValueType();
1844
1845 // If this is a 2 element vector, we really want to scalarize and not create
1846 // weird 1 element vectors.
1847 if (VT.getVectorNumElements() == 2)
1848 return scalarizeVectorStore(Store, DAG);
1849
1850 EVT MemVT = Store->getMemoryVT();
1851 SDValue Chain = Store->getChain();
1852 SDValue BasePtr = Store->getBasePtr();
1853 SDLoc SL(Op);
1854
1855 EVT LoVT, HiVT;
1856 EVT LoMemVT, HiMemVT;
1857 SDValue Lo, Hi;
1858
1859 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1860 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1861 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1862
1863 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1864
1865 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1866 Align BaseAlign = Store->getAlign();
1867 unsigned Size = LoMemVT.getStoreSize();
1868 Align HiAlign = commonAlignment(BaseAlign, Size);
1869
1870 SDValue LoStore =
1871 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1872 Store->getMemOperand()->getFlags());
1873 SDValue HiStore =
1874 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1875 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1876
1877 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1878}
1879
1880// This is a shortcut for integer division because we have fast i32<->f32
1881// conversions, and fast f32 reciprocal instructions. The fractional part of a
1882// float is enough to accurately represent up to a 24-bit signed integer.
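// Illustrative instance (added note, not from the original source): for
// 100 udiv 7, fq = trunc(100.0f * rcp(7.0f)) = 14.0f and
// |fr| = |100 - 14*7| = 2 < 7, so the +1 correction below is not selected and
// the result is Div = 14, Rem = 2.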
1883SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1884 bool Sign) const {
1885 SDLoc DL(Op);
1886 EVT VT = Op.getValueType();
1887 SDValue LHS = Op.getOperand(0);
1888 SDValue RHS = Op.getOperand(1);
1889 MVT IntVT = MVT::i32;
1890 MVT FltVT = MVT::f32;
1891
1892 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1893 if (LHSSignBits < 9)
1894 return SDValue();
1895
1896 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1897 if (RHSSignBits < 9)
1898 return SDValue();
1899
1900 unsigned BitSize = VT.getSizeInBits();
1901 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1902 unsigned DivBits = BitSize - SignBits;
1903 if (Sign)
1904 ++DivBits;
1905
1906 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1907 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1908
1909 SDValue jq = DAG.getConstant(1, DL, IntVT);
1910
1911 if (Sign) {
1912 // char|short jq = ia ^ ib;
1913 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1914
1915 // jq = jq >> (bitsize - 2)
1916 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1917 DAG.getConstant(BitSize - 2, DL, VT));
1918
1919 // jq = jq | 0x1
1920 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1921 }
1922
1923 // int ia = (int)LHS;
1924 SDValue ia = LHS;
1925
1926 // int ib = (int)RHS;
1927 SDValue ib = RHS;
1928
1929 // float fa = (float)ia;
1930 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1931
1932 // float fb = (float)ib;
1933 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1934
1935 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1936 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1937
1938 // fq = trunc(fq);
1939 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1940
1941 // float fqneg = -fq;
1942 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1943
1944 MachineFunction &MF = DAG.getMachineFunction();
1945
1946 bool UseFmadFtz = false;
1947 if (Subtarget->isGCN()) {
1948 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1949 UseFmadFtz =
1950 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1951 }
1952
1953 // float fr = mad(fqneg, fb, fa);
1954 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1955 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1956 : (unsigned)ISD::FMAD;
1957 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1958
1959 // int iq = (int)fq;
1960 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1961
1962 // fr = fabs(fr);
1963 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1964
1965 // fb = fabs(fb);
1966 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1967
1968 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1969
1970 // int cv = fr >= fb;
1971 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1972
1973 // jq = (cv ? jq : 0);
1974 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1975
1976 // dst = iq + jq;
1977 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1978
1979 // Rem needs compensation; it's easier to recompute it.
1980 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1981 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1982
1983 // Truncate to the number of bits this divide really is.
1984 if (Sign) {
1985 SDValue InRegSize
1986 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1987 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1988 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1989 } else {
1990 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1991 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1992 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1993 }
1994
1995 return DAG.getMergeValues({ Div, Rem }, DL);
1996}
1997
1998void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1999 SelectionDAG &DAG,
2000 SmallVectorImpl<SDValue> &Results) const {
2001 SDLoc DL(Op);
2002 EVT VT = Op.getValueType();
2003
2004 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2005
2006 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2007
2008 SDValue One = DAG.getConstant(1, DL, HalfVT);
2009 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2010
2011 //HiLo split
2012 SDValue LHS_Lo, LHS_Hi;
2013 SDValue LHS = Op.getOperand(0);
2014 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2015
2016 SDValue RHS_Lo, RHS_Hi;
2017 SDValue RHS = Op.getOperand(1);
2018 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2019
2020 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2021 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2022
2023 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2024 LHS_Lo, RHS_Lo);
2025
2026 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2027 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2028
2029 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2030 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2031 return;
2032 }
2033
2034 if (isTypeLegal(MVT::i64)) {
2035 // The algorithm here is based on ideas from "Software Integer Division",
2036 // Tom Rodeheffer, August 2008.
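    // Informal summary of the steps below (added note): build an approximate
    // 64-bit reciprocal of RHS out of f32 operations, refine it with two
    // Newton-Raphson rounds in 64-bit integer arithmetic, form the candidate
    // quotient Mulhi3 = mulhu(LHS, rcp) and remainder Sub1 = LHS - Mulhi3*RHS,
    // then correct both with at most two conditional subtractions of RHS
    // (the C3/C6 selects).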
2037
2038 MachineFunction &MF = DAG.getMachineFunction();
2039 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2040
2041 // Compute denominator reciprocal.
2042 unsigned FMAD =
2043 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2044 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2045 ? (unsigned)ISD::FMAD
2046 : (unsigned)AMDGPUISD::FMAD_FTZ;
2047
2048 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2049 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2050 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2051 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2052 Cvt_Lo);
2053 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2054 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2055 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2056 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2057 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2058 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2059 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2060 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2061 Mul1);
2062 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2063 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2064 SDValue Rcp64 = DAG.getBitcast(VT,
2065 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2066
2067 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2068 SDValue One64 = DAG.getConstant(1, DL, VT);
2069 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2070 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2071
2072 // First round of UNR (Unsigned integer Newton-Raphson).
2073 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2074 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2075 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2076 SDValue Mulhi1_Lo, Mulhi1_Hi;
2077 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2078 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2079 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2080 Mulhi1_Lo, Zero1);
2081 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2082 Mulhi1_Hi, Add1_Lo.getValue(1));
2083 SDValue Add1 = DAG.getBitcast(VT,
2084 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2085
2086 // Second round of UNR.
2087 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2088 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2089 SDValue Mulhi2_Lo, Mulhi2_Hi;
2090 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2091 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2092 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2093 Mulhi2_Lo, Zero1);
2094 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2095 Mulhi2_Hi, Add2_Lo.getValue(1));
2096 SDValue Add2 = DAG.getBitcast(VT,
2097 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2098
2099 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2100
2101 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2102
2103 SDValue Mul3_Lo, Mul3_Hi;
2104 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2105 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2106 Mul3_Lo, Zero1);
2107 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2108 Mul3_Hi, Sub1_Lo.getValue(1));
2109 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2110 SDValue Sub1 = DAG.getBitcast(VT,
2111 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2112
2113 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2114 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2115 ISD::SETUGE);
2116 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2117 ISD::SETUGE);
2118 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2119
2120 // TODO: Here and below portions of the code can be enclosed into if/endif.
2121 // Currently control flow is unconditional and we have 4 selects after
2122 // potential endif to substitute PHIs.
2123
2124 // if C3 != 0 ...
2125 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2126 RHS_Lo, Zero1);
2127 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2128 RHS_Hi, Sub1_Lo.getValue(1));
2129 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2130 Zero, Sub2_Lo.getValue(1));
2131 SDValue Sub2 = DAG.getBitcast(VT,
2132 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2133
2134 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2135
2136 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2137 ISD::SETUGE);
2138 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2139 ISD::SETUGE);
2140 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2141
2142 // if (C6 != 0)
2143 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2144
2145 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2146 RHS_Lo, Zero1);
2147 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2148 RHS_Hi, Sub2_Lo.getValue(1));
2149 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2150 Zero, Sub3_Lo.getValue(1));
2151 SDValue Sub3 = DAG.getBitcast(VT,
2152 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2153
2154 // endif C6
2155 // endif C3
2156
2157 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2158 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2159
2160 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2161 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2162
2163 Results.push_back(Div);
2164 Results.push_back(Rem);
2165
2166 return;
2167 }
2168
2169 // R600 expansion.
2170 // Get Speculative values
2171 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2172 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2173
2174 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2175 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2176 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2177
2178 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2179 SDValue DIV_Lo = Zero;
2180
2181 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2182
2183 for (unsigned i = 0; i < halfBitWidth; ++i) {
2184 const unsigned bitPos = halfBitWidth - i - 1;
2185 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2186 // Get value of high bit
2187 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2188 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2189 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2190
2191 // Shift
2192 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2193 // Add LHS high bit
2194 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2195
2196 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2197 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2198
2199 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2200
2201 // Update REM
2202 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2203 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2204 }
2205
2206 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2207 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2208 Results.push_back(DIV);
2209 Results.push_back(REM);
2210}
2211
2212SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2213 SelectionDAG &DAG) const {
2214 SDLoc DL(Op);
2215 EVT VT = Op.getValueType();
2216
2217 if (VT == MVT::i64) {
2218 SmallVector<SDValue, 2> Results;
2219 LowerUDIVREM64(Op, DAG, Results);
2220 return DAG.getMergeValues(Results, DL);
2221 }
2222
2223 if (VT == MVT::i32) {
2224 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2225 return Res;
2226 }
2227
2228 SDValue X = Op.getOperand(0);
2229 SDValue Y = Op.getOperand(1);
2230
2231 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2232 // algorithm used here.
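  // Informal summary (added note): Z ~= 1/Y from URECIP, one Newton-Raphson
  // refinement Z += mulhu(Z, -Y*Z), candidate Q = mulhu(X, Z) and
  // R = X - Q*Y, then two rounds of "if (R >= Y) { Q += 1; R -= Y; }".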
2233
2234 // Initial estimate of inv(y).
2235 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2236
2237 // One round of UNR.
2238 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2239 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2240 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2241 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2242
2243 // Quotient/remainder estimate.
2244 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2245 SDValue R =
2246 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2247
2248 // First quotient/remainder refinement.
2249 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2250 SDValue One = DAG.getConstant(1, DL, VT);
2251 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2252 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2253 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2254 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2255 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2256
2257 // Second quotient/remainder refinement.
2258 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2259 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2260 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2261 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2262 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2263
2264 return DAG.getMergeValues({Q, R}, DL);
2265}
2266
2267SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2268 SelectionDAG &DAG) const {
2269 SDLoc DL(Op);
2270 EVT VT = Op.getValueType();
2271
2272 SDValue LHS = Op.getOperand(0);
2273 SDValue RHS = Op.getOperand(1);
2274
2275 SDValue Zero = DAG.getConstant(0, DL, VT);
2276 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2277
2278 if (VT == MVT::i32) {
2279 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2280 return Res;
2281 }
2282
2283 if (VT == MVT::i64 &&
2284 DAG.ComputeNumSignBits(LHS) > 32 &&
2285 DAG.ComputeNumSignBits(RHS) > 32) {
2286 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2287
2288 //HiLo split
2289 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2290 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2291 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2292 LHS_Lo, RHS_Lo);
2293 SDValue Res[2] = {
2294 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2295 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2296 };
2297 return DAG.getMergeValues(Res, DL);
2298 }
2299
2300 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2301 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2302 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2303 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2304
2305 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2306 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2307
2308 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2309 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2310
2311 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2312 SDValue Rem = Div.getValue(1);
2313
2314 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2315 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2316
2317 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2318 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2319
2320 SDValue Res[2] = {
2321 Div,
2322 Rem
2323 };
2324 return DAG.getMergeValues(Res, DL);
2325}
2326
2327// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
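// For example (illustrative, added note): frem(5.5, 2.0) -> fdiv = 2.75,
// ftrunc = 2.0, fma(-2.0, 2.0, 5.5) = 1.5.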
2328SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2329 SDLoc SL(Op);
2330 EVT VT = Op.getValueType();
2331 auto Flags = Op->getFlags();
2332 SDValue X = Op.getOperand(0);
2333 SDValue Y = Op.getOperand(1);
2334
2335 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2336 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2337 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2338 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2339 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2340}
2341
2342SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2343 SDLoc SL(Op);
2344 SDValue Src = Op.getOperand(0);
2345
2346 // result = trunc(src)
2347 // if (src > 0.0 && src != result)
2348 // result += 1.0
2349
2350 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2351
2352 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2353 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2354
2355 EVT SetCCVT =
2356 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2357
2358 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2359 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2360 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2361
2362 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2363 // TODO: Should this propagate fast-math-flags?
2364 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2365}
2366
2367static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2368 SelectionDAG &DAG) {
2369 const unsigned FractBits = 52;
2370 const unsigned ExpBits = 11;
2371
2372 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2373 Hi,
2374 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2375 DAG.getConstant(ExpBits, SL, MVT::i32));
2376 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2377 DAG.getConstant(1023, SL, MVT::i32));
2378
2379 return Exp;
2380}
2381
2382SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2383 SDLoc SL(Op);
2384 SDValue Src = Op.getOperand(0);
2385
2386 assert(Op.getValueType() == MVT::f64);
2387
2388 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2389
2390 // Extract the upper half, since this is where we will find the sign and
2391 // exponent.
2392 SDValue Hi = getHiHalf64(Src, DAG);
2393
2394 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2395
2396 const unsigned FractBits = 52;
2397
2398 // Extract the sign bit.
2399 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2400 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2401
2402 // Extend back to 64-bits.
2403 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2404 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2405
2406 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2407 const SDValue FractMask
2408 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2409
2410 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2411 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2412 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2413
2414 EVT SetCCVT =
2415 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2416
2417 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2418
2419 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2420 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2421
2422 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2423 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2424
2425 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2426}
2427
2428SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2429 SelectionDAG &DAG) const {
2430 SDLoc SL(Op);
2431 SDValue Src = Op.getOperand(0);
2432
2433 assert(Op.getValueType() == MVT::f64);
2434
2435 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2436 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2437 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2438
2439 // TODO: Should this propagate fast-math-flags?
2440
2441 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2442 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2443
2444 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2445
2446 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2447 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2448
2449 EVT SetCCVT =
2450 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2451 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2452
2453 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2454}
2455
2456SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2457 SelectionDAG &DAG) const {
2458 // FNEARBYINT and FRINT are the same, except in their handling of FP
2459 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2460 // rint, so just treat them as equivalent.
2461 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2462 Op.getOperand(0));
2463}
2464
2465SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2466 auto VT = Op.getValueType();
2467 auto Arg = Op.getOperand(0u);
2468 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2469}
2470
2471// XXX - May require not supporting f32 denormals?
2472
2473// Don't handle v2f16. The extra instructions to scalarize and repack around the
2474// compare and vselect end up producing worse code than scalarizing the whole
2475// operation.
2476SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2477 SDLoc SL(Op);
2478 SDValue X = Op.getOperand(0);
2479 EVT VT = Op.getValueType();
2480
2481 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2482
2483 // TODO: Should this propagate fast-math-flags?
2484
2485 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2486
2487 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2488
2489 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2490 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2491
2492 EVT SetCCVT =
2493 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2494
2495 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2496 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2497 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2498
2499 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2500 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2501}
2502
2503SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2504 SDLoc SL(Op);
2505 SDValue Src = Op.getOperand(0);
2506
2507 // result = trunc(src);
2508 // if (src < 0.0 && src != result)
2509 // result += -1.0.
2510
2511 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2512
2513 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2514 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2515
2516 EVT SetCCVT =
2517 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2518
2519 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2520 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2521 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2522
2523 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2524 // TODO: Should this propagate fast-math-flags?
2525 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2526}
2527
2528/// Return true if it's known that \p Src can never be an f32 denormal value.
2529static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2530 switch (Src.getOpcode()) {
2531 case ISD::FP_EXTEND:
2532 return Src.getOperand(0).getValueType() == MVT::f16;
2533 case ISD::FP16_TO_FP:
2534 case ISD::FFREXP:
2535 return true;
2536 case ISD::INTRINSIC_WO_CHAIN: {
2537 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2538 switch (IntrinsicID) {
2539 case Intrinsic::amdgcn_frexp_mant:
2540 return true;
2541 default:
2542 return false;
2543 }
2544 }
2545 default:
2546 return false;
2547 }
2548
2549 llvm_unreachable("covered opcode switch");
2550}
2551
2552static bool allowApproxFunc(const SelectionDAG &DAG,
2553 SDNodeFlags Flags) {
2554 if (Flags.hasApproximateFuncs())
2555 return true;
2556 auto &Options = DAG.getTarget().Options;
2557 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2558}
2559
2560static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2561 SDValue Src,
2562 SDNodeFlags Flags) {
2563 return !valueIsKnownNeverF32Denorm(Src) &&
2564 DAG.getMachineFunction()
2565 .getDenormalMode(APFloat::IEEEsingle())
2566 .Input != DenormalMode::PreserveSign;
2567}
2568
2569SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2570 SDValue Src,
2571 SDNodeFlags Flags) const {
2572 SDLoc SL(Src);
2573 EVT VT = Src.getValueType();
2574 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2575 SDValue SmallestNormal =
2576 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2577
2578 // Want to scale denormals up, but negatives and 0 work just as well on the
2579 // scaled path.
2580 SDValue IsLtSmallestNormal = DAG.getSetCC(
2581 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2582 SmallestNormal, ISD::SETOLT);
2583
2584 return IsLtSmallestNormal;
2585}
2586
2587SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2588 SDNodeFlags Flags) const {
2589 SDLoc SL(Src);
2590 EVT VT = Src.getValueType();
2591 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2592 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2593
2594 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2595 SDValue IsFinite = DAG.getSetCC(
2596 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2597 Inf, ISD::SETOLT);
2598 return IsFinite;
2599}
2600
2601/// If denormal handling is required return the scaled input to FLOG2, and the
2602/// check for denormal range. Otherwise, return null values.
2603std::pair<SDValue, SDValue>
2604AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2605 SDValue Src, SDNodeFlags Flags) const {
2606 if (!needsDenormHandlingF32(DAG, Src, Flags))
2607 return {};
2608
2609 MVT VT = MVT::f32;
2610 const fltSemantics &Semantics = APFloat::IEEEsingle();
2611 SDValue SmallestNormal =
2612 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2613
2614 SDValue IsLtSmallestNormal = DAG.getSetCC(
2615 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2616 SmallestNormal, ISD::SETOLT);
2617
2618 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2619 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2620 SDValue ScaleFactor =
2621 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2622
2623 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2624 return {ScaledInput, IsLtSmallestNormal};
2625}
2626
2627SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2628 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2629 // If we have to handle denormals, scale up the input and adjust the result.
2630
2631 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2632 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
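  // Worked example (added note, illustrative): for the denormal input
  // x = 0x1.0p-140, scaled = 0x1.0p-108 is normal, v_log_f32 returns -108.0,
  // and subtracting 32.0 recovers log2(x) = -140.0.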
2633
2634 SDLoc SL(Op);
2635 EVT VT = Op.getValueType();
2636 SDValue Src = Op.getOperand(0);
2637 SDNodeFlags Flags = Op->getFlags();
2638
2639 if (VT == MVT::f16) {
2640 // Nothing in half is a denormal when promoted to f32.
2641 assert(!Subtarget->has16BitInsts());
2642 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2643 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2644 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2645 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2646 }
2647
2648 auto [ScaledInput, IsLtSmallestNormal] =
2649 getScaledLogInput(DAG, SL, Src, Flags);
2650 if (!ScaledInput)
2651 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2652
2653 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2654
2655 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2656 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2657 SDValue ResultOffset =
2658 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2659 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2660}
2661
2662static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2663 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2664 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2665 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2666}
2667
2668SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2669 SelectionDAG &DAG) const {
2670 SDValue X = Op.getOperand(0);
2671 EVT VT = Op.getValueType();
2672 SDNodeFlags Flags = Op->getFlags();
2673 SDLoc DL(Op);
2674
2675 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2676 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2677
2678 const auto &Options = getTargetMachine().Options;
2679 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2680 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2681
2682 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2683 // Log and multiply in f32 is good enough for f16.
2684 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2685 }
2686
2687 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2688 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2689 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2690 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2691 }
2692
2693 return Lowered;
2694 }
2695
2696 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2697 if (ScaledInput)
2698 X = ScaledInput;
2699
2700 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2701
2702 SDValue R;
2703 if (Subtarget->hasFastFMAF32()) {
2704 // c+cc are ln(2)/ln(10) to more than 49 bits
2705 const float c_log10 = 0x1.344134p-2f;
2706 const float cc_log10 = 0x1.09f79ep-26f;
2707
2708 // c + cc is ln(2) to more than 49 bits
2709 const float c_log = 0x1.62e42ep-1f;
2710 const float cc_log = 0x1.efa39ep-25f;
2711
2712 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2713 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2714
2715 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2716 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2717 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2718 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2719 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2720 } else {
2721 // ch+ct is ln(2)/ln(10) to more than 36 bits
2722 const float ch_log10 = 0x1.344000p-2f;
2723 const float ct_log10 = 0x1.3509f6p-18f;
2724
2725 // ch + ct is ln(2) to more than 36 bits
2726 const float ch_log = 0x1.62e000p-1f;
2727 const float ct_log = 0x1.0bfbe8p-15f;
2728
2729 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2730 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2731
2732 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2733 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2734 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2735 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2736 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2737
2738 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2739 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2740 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2741 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2742 }
2743
2744 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2745 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2746
2747 // TODO: Check if known finite from source value.
2748 if (!IsFiniteOnly) {
2749 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2750 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2751 }
2752
2753 if (IsScaled) {
2754 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2755 SDValue ShiftK =
2756 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2757 SDValue Shift =
2758 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2759 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2760 }
2761
2762 return R;
2763}
2764
2765SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2766 return LowerFLOGCommon(Op, DAG);
2767}
2768
2769// Do f32 fast-math expansion for flog2 or flog10. This is accurate enough for a
2770// promoted f16 operation.
2771SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2772 SelectionDAG &DAG, bool IsLog10,
2773 SDNodeFlags Flags) const {
2774 EVT VT = Src.getValueType();
2775 unsigned LogOp =
2776 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2777
2778 double Log2BaseInverted =
2779 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2780
2781 if (VT == MVT::f32) {
2782 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2783 if (ScaledInput) {
2784 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2785 SDValue ScaledResultOffset =
2786 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2787
2788 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2789
2790 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2791 ScaledResultOffset, Zero, Flags);
2792
2793 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2794
2795 if (Subtarget->hasFastFMAF32())
2796 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2797 Flags);
2798 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2799 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2800 }
2801 }
2802
2803 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2804 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2805
2806 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2807 Flags);
2808}
2809
2810SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2811 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2812 // If we have to handle denormals, scale up the input and adjust the result.
2813
2814 SDLoc SL(Op);
2815 EVT VT = Op.getValueType();
2816 SDValue Src = Op.getOperand(0);
2817 SDNodeFlags Flags = Op->getFlags();
2818
2819 if (VT == MVT::f16) {
2820 // Nothing in half is a denormal when promoted to f32.
2821 assert(!Subtarget->has16BitInsts());
2822 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2823 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2824 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2825 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2826 }
2827
2828 assert(VT == MVT::f32);
2829
2830 if (!needsDenormHandlingF32(DAG, Src, Flags))
2831 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2832
2833 // bool needs_scaling = x < -0x1.f80000p+6f;
2834 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
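  // Illustrative check (added note): for x = -130.0 the direct v_exp_f32
  // result 0x1.0p-130 would fall in the f32 denormal range; instead
  // exp2(-130 + 64) = 0x1.0p-66 is computed in the normal range and the final
  // multiply by 0x1.0p-64 produces 0x1.0p-130 with the expected denormal
  // semantics.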
2835
2836 // -nextafter(128.0, -1)
2837 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2838
2839 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2840
2841 SDValue NeedsScaling =
2842 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2843
2844 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2845 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2846
2847 SDValue AddOffset =
2848 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2849
2850 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2851 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2852
2853 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2854 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2855 SDValue ResultScale =
2856 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2857
2858 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2859}
2860
2861SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2862 SelectionDAG &DAG,
2863 SDNodeFlags Flags) const {
2864 EVT VT = X.getValueType();
2865 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2866
2867 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2868 // exp2(M_LOG2E_F * f);
2869 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2870 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2871 : (unsigned)ISD::FEXP2,
2872 SL, VT, Mul, Flags);
2873 }
2874
2875 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2876
2877 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2878 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2879
2880 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2881
2882 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2883
2884 SDValue AdjustedX =
2885 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2886
2887 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2888
2889 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2890
2891 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2892 SDValue AdjustedResult =
2893 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2894
2895 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2896 Flags);
2897}
2898
2899/// Emit a lowering for exp10 that is appropriate when approximate functions are
2900/// allowed. inf/nan should still be handled correctly.
2901SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2902 SelectionDAG &DAG,
2903 SDNodeFlags Flags) const {
2904 const EVT VT = X.getValueType();
2905 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2906
2907 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2908 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
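    // Added note: the two constants are a high/low split of
    // log2(10) ~= 3.32192809; 0x1.a92000p+1f + 0x1.4f0978p-11f carries about
    // 36 significant bits where a single f32 constant would only carry 24.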
2909 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2910 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2911
2912 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2913 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2914 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2915 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2916 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2917 }
2918
2919 // bool s = x < -0x1.2f7030p+5f;
2920 // x += s ? 0x1.0p+5f : 0.0f;
2921 // exp10 = exp2(x * 0x1.a92000p+1f) *
2922 // exp2(x * 0x1.4f0978p-11f) *
2923 // (s ? 0x1.9f623ep-107f : 1.0f);
2924
2925 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2926
2927 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2928 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2929
2930 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2931 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2932 SDValue AdjustedX =
2933 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2934
2935 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2936 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2937
2938 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2939 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2940 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2941 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2942
2943 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2944
2945 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2946 SDValue AdjustedResult =
2947 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2948
2949 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2950 Flags);
2951}
2952
2953SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2954 EVT VT = Op.getValueType();
2955 SDLoc SL(Op);
2956 SDValue X = Op.getOperand(0);
2957 SDNodeFlags Flags = Op->getFlags();
2958 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2959
2960 if (VT.getScalarType() == MVT::f16) {
2961 // v_exp_f16 (fmul x, log2e)
2962 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2963 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2964
2965 if (VT.isVector())
2966 return SDValue();
2967
2968 // exp(f16 x) ->
2969 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2970
2971 // Nothing in half is a denormal when promoted to f32.
2972 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2973 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2974 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2975 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2976 }
2977
2978 assert(VT == MVT::f32);
2979
2980 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2981 // library behavior. Also, is known-not-daz source sufficient?
2982 if (allowApproxFunc(DAG, Flags)) {
2983 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2984 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2985 }
2986
2987 // Algorithm:
2988 //
2989 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2990 //
2991 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2992 // n = 64*m + j, 0 <= j < 64
2993 //
2994 // e^x = 2^((64*m + j + f)/64)
2995 // = (2^m) * (2^(j/64)) * 2^(f/64)
2996 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2997 //
2998 // f = x*(64/ln(2)) - n
2999 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3000 //
3001 // e^x = (2^m) * (2^(j/64)) * e^r
3002 //
3003 // (2^(j/64)) is precomputed
3004 //
3005 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3006 // e^r = 1 + q
3007 //
3008 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3009 //
3010 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
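  // Illustrative instance of the range reduction below (added note): for
  // x = 10.0, x * log2(e) ~= 14.4269504, E = roundeven(...) = 14.0, the
  // remaining fraction ~0.4269504 stays in v_exp_f32's accurate range, and
  // ldexp(exp2(0.4269504), 14) ~= 22026.47 ~= e^10.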
3011 SDNodeFlags FlagsNoContract = Flags;
3012 FlagsNoContract.setAllowContract(false);
3013
3014 SDValue PH, PL;
3015 if (Subtarget->hasFastFMAF32()) {
3016 const float c_exp = numbers::log2ef;
3017 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3018 const float c_exp10 = 0x1.a934f0p+1f;
3019 const float cc_exp10 = 0x1.2f346ep-24f;
3020
3021 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3022 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3023
3024 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3025 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3026 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3027 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3028 } else {
3029 const float ch_exp = 0x1.714000p+0f;
3030 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3031
3032 const float ch_exp10 = 0x1.a92000p+1f;
3033 const float cl_exp10 = 0x1.4f0978p-11f;
3034
3035 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3036 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3037
3038 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3039 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3040 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3041 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3042 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3043
3044 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3045
3046 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3047 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3048 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3049 }
3050
3051 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3052
3053 // It is unsafe to contract this fsub into the PH multiply.
3054 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3055
3056 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3057 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3058 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3059
3060 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3061
3062 SDValue UnderflowCheckConst =
3063 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3064
3065 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3066 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3067 SDValue Underflow =
3068 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3069
3070 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3071 const auto &Options = getTargetMachine().Options;
3072
3073 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3074 SDValue OverflowCheckConst =
3075 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3076 SDValue Overflow =
3077 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3078 SDValue Inf =
3079 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3080 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3081 }
3082
3083 return R;
3084}
3085
3086static bool isCtlzOpc(unsigned Opc) {
3087 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3088}
3089
3090static bool isCttzOpc(unsigned Opc) {
3091 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3092}
3093
3094SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3095 SelectionDAG &DAG) const {
3096 auto SL = SDLoc(Op);
3097 auto Arg = Op.getOperand(0u);
3098 auto ResultVT = Op.getValueType();
3099
3100 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3101 return {};
3102
3103 assert(isCtlzOpc(Op.getOpcode()));
3104 assert(ResultVT == Arg.getValueType());
3105
3106 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3107 auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3108 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3109 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3110 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
3111 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3112}
3113
3114SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3115 SDLoc SL(Op);
3116 SDValue Src = Op.getOperand(0);
3117
3118 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3119 bool Ctlz = isCtlzOpc(Op.getOpcode());
3120 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3121
3122 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3123 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3124 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3125
3126 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3127 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3128 // (cttz hi:lo) -> (umin (ffbl src), 32)
3129 // (ctlz_zero_undef src) -> (ffbh src)
3130 // (cttz_zero_undef src) -> (ffbl src)
3131
3132 // The 64-bit scalar version produces a 32-bit result:
3133 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3134 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3135 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3136 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3137 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3138 if (!ZeroUndef) {
3139 const SDValue ConstVal = DAG.getConstant(
3140 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3141 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3142 }
3143 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3144 }
3145
3146 SDValue Lo, Hi;
3147 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3148
3149 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3150 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3151
3152 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3153 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3154 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3155 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
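  // Example (added note, illustrative; ffbh/ffbl return -1 for a zero input):
  // ctlz(0x0000000000000001) -> ffbh(hi = 0) = 0xffffffff,
  // uaddsat(ffbh(lo = 1) = 31, 32) = 63, umin3(0xffffffff, 63, 64) = 63.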
3156
3157 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3158 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3159 if (Ctlz)
3160 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3161 else
3162 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3163
3164 SDValue NewOpr;
3165 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3166 if (!ZeroUndef) {
3167 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3168 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3169 }
3170
3171 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3172}
3173
3174SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3175 bool Signed) const {
3176 // The regular method of converting a 64-bit integer to float roughly consists of
3177 // 2 steps: normalization and rounding. In fact, after normalization, the
3178 // conversion from a 64-bit integer to a float is essentially the same as the
3179 // one from a 32-bit integer. The only difference is that it has more
3180 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3181 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3182 // converted into the correct float number. The basic steps for the unsigned
3183 // conversion are illustrated in the following pseudo code:
3184 //
3185 // f32 uitofp(i64 u) {
3186 // i32 hi, lo = split(u);
3187 // // Only count the leading zeros in hi as we have native support of the
3188 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3189 // // reduced to a 32-bit one automatically.
3190 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3191 // u <<= shamt;
3192 // hi, lo = split(u);
3193 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3194 // // convert it as a 32-bit integer and scale the result back.
3195 // return uitofp(hi) * 2^(32 - shamt);
3196 // }
3197 //
3198 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3199 // sign bits instead. If 'ffbh_i32' is not available, the absolute value is
3200 // converted instead, followed by a negation based on the original sign bit.
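  // Worked example for the sticky bit (added note, illustrative):
  // u = 2^32 + 257. Then hi = 1, shamt = clz(1) = 31, and after the shift
  // hi = 0x80000080, lo = 0x80000000. Because lo != 0, hi |= 1 = 0x80000081
  // breaks the round-to-nearest-even tie upward, uitofp(hi) = 0x1.000002p+31,
  // and scaling by 2^(32 - 31) gives 0x1.000002p+32, the correctly rounded
  // f32 for 2^32 + 257; without the adjustment the tie would round down to
  // 2^32.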
3201
3202 SDLoc SL(Op);
3203 SDValue Src = Op.getOperand(0);
3204
3205 SDValue Lo, Hi;
3206 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3207 SDValue Sign;
3208 SDValue ShAmt;
3209 if (Signed && Subtarget->isGCN()) {
3210 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3211 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3212 // account. That is, the maximal shift is
3213 // - 32 if Lo and Hi have opposite signs;
3214 // - 33 if Lo and Hi have the same sign.
3215 //
3216 // Or, MaxShAmt = 33 + OppositeSign, where
3217 //
3218 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3219 // - -1 if Lo and Hi have opposite signs; and
3220 // - 0 otherwise.
3221 //
3222 // All in all, ShAmt is calculated as
3223 //
3224 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3225 //
3226 // or
3227 //
3228 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3229 //
3230 // to reduce the critical path.
3231 SDValue OppositeSign = DAG.getNode(
3232 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3233 DAG.getConstant(31, SL, MVT::i32));
3234 SDValue MaxShAmt =
3235 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3236 OppositeSign);
3237 // Count the leading sign bits.
3238 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3239 // Unlike the unsigned conversion, the shift should be one bit less to
3240 // preserve the sign bit.
3241 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3242 DAG.getConstant(1, SL, MVT::i32));
3243 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3244 } else {
3245 if (Signed) {
3246 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3247 // absolute value first.
3248 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3249 DAG.getConstant(63, SL, MVT::i64));
3250 SDValue Abs =
3251 DAG.getNode(ISD::XOR, SL, MVT::i64,
3252 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3253 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3254 }
3255 // Count the leading zeros.
3256 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3257 // The shift amount for signed integers is [0, 32].
3258 }
3259 // Normalize the given 64-bit integer.
3260 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3261 // Split it again.
3262 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3263 // Calculate the adjust bit for rounding.
3264 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3265 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3266 DAG.getConstant(1, SL, MVT::i32), Lo);
3267 // Get the 32-bit normalized integer.
3268 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3269 // Convert the normalized 32-bit integer into f32.
3270 unsigned Opc =
3271 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3272 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3273
3274 // Finally, we need to scale the converted floating-point number back, since
3275 // the original 64-bit integer was converted as a 32-bit one.
3276 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3277 ShAmt);
3278 // On GCN, use LDEXP directly.
3279 if (Subtarget->isGCN())
3280 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3281
3282 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3283 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3284 // exponent is enough to avoid overflowing into the sign bit.
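  // For instance (added note, illustrative): bitcast(1.0f) = 0x3f800000, and
  // adding 5 << 23 = 0x02800000 gives 0x42000000, which is the bit pattern of
  // 32.0f = 1.0f * 2^5.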
3285 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3286 DAG.getConstant(23, SL, MVT::i32));
3287 SDValue IVal =
3288 DAG.getNode(ISD::ADD, SL, MVT::i32,
3289 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3290 if (Signed) {
3291 // Set the sign bit.
3292 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3293 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3294 DAG.getConstant(31, SL, MVT::i32));
3295 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3296 }
3297 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3298}
3299
3300SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3301 bool Signed) const {
3302 SDLoc SL(Op);
3303 SDValue Src = Op.getOperand(0);
3304
3305 SDValue Lo, Hi;
3306 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3307
3308 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3309 SL, MVT::f64, Hi);
3310
3311 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3312
3313 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3314 DAG.getConstant(32, SL, MVT::i32));
3315 // TODO: Should this propagate fast-math-flags?
3316 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3317}
3318
3319SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3320 SelectionDAG &DAG) const {
3321 // TODO: Factor out code common with LowerSINT_TO_FP.
3322 EVT DestVT = Op.getValueType();
3323 SDValue Src = Op.getOperand(0);
3324 EVT SrcVT = Src.getValueType();
3325
3326 if (SrcVT == MVT::i16) {
3327 if (DestVT == MVT::f16)
3328 return Op;
3329 SDLoc DL(Op);
3330
3331 // Promote src to i32
3332 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3333 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3334 }
3335
3336 if (DestVT == MVT::bf16) {
3337 SDLoc SL(Op);
3338 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3339 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3340 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3341 }
3342
3343 if (SrcVT != MVT::i64)
3344 return Op;
3345
3346 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3347 SDLoc DL(Op);
3348
3349 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3350 SDValue FPRoundFlag =
3351 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3352 SDValue FPRound =
3353 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3354
3355 return FPRound;
3356 }
3357
3358 if (DestVT == MVT::f32)
3359 return LowerINT_TO_FP32(Op, DAG, false);
3360
3361 assert(DestVT == MVT::f64);
3362 return LowerINT_TO_FP64(Op, DAG, false);
3363}
3364
3365SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3366 SelectionDAG &DAG) const {
3367 EVT DestVT = Op.getValueType();
3368
3369 SDValue Src = Op.getOperand(0);
3370 EVT SrcVT = Src.getValueType();
3371
3372 if (SrcVT == MVT::i16) {
3373 if (DestVT == MVT::f16)
3374 return Op;
3375
3376 SDLoc DL(Op);
3377 // Promote src to i32
3378 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3379 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3380 }
3381
3382 if (DestVT == MVT::bf16) {
3383 SDLoc SL(Op);
3384 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3385 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3386 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3387 }
3388
3389 if (SrcVT != MVT::i64)
3390 return Op;
3391
3392 // TODO: Factor out code common with LowerUINT_TO_FP.
3393
3394 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3395 SDLoc DL(Op);
3396 SDValue Src = Op.getOperand(0);
3397
3398 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3399 SDValue FPRoundFlag =
3400 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3401 SDValue FPRound =
3402 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3403
3404 return FPRound;
3405 }
3406
3407 if (DestVT == MVT::f32)
3408 return LowerINT_TO_FP32(Op, DAG, true);
3409
3410 assert(DestVT == MVT::f64);
3411 return LowerINT_TO_FP64(Op, DAG, true);
3412}
3413
3414SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3415 bool Signed) const {
3416 SDLoc SL(Op);
3417
3418 SDValue Src = Op.getOperand(0);
3419 EVT SrcVT = Src.getValueType();
3420
3421 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3422
3423 // The basic idea of converting a floating point number into a pair of 32-bit
3424 // integers is illustrated as follows:
3425 //
3426 // tf := trunc(val);
3427 // hif := floor(tf * 2^-32);
3428 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3429 // hi := fptoi(hif);
3430 // lo := fptoi(lof);
3431 //
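 // For example, with val = 2^32 + 5 as f64: tf = 4294967301.0, hif = 1.0 and
 // lof = 5.0, so hi = 1 and lo = 5, reassembling to the i64 value 0x100000005.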
3432 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3433 SDValue Sign;
3434 if (Signed && SrcVT == MVT::f32) {
3435 // However, a 32-bit floating point number has only 23 mantissa bits, which
3436 // is not enough to hold all the significant bits of `lof` if val is
3437 // negative. To avoid the loss of precision, we need to take the absolute
3438 // value after truncating and flip the result back based on the original
3439 // signedness.
3440 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3441 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3442 DAG.getConstant(31, SL, MVT::i32));
3443 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3444 }
3445
3446 SDValue K0, K1;
3447 if (SrcVT == MVT::f64) {
3448 K0 = DAG.getConstantFP(
3449 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3450 SrcVT);
3451 K1 = DAG.getConstantFP(
3452 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3453 SrcVT);
3454 } else {
3455 K0 = DAG.getConstantFP(
3456 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3457 K1 = DAG.getConstantFP(
3458 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3459 }
3460 // TODO: Should this propagate fast-math-flags?
3461 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3462
3463 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3464
3465 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3466
3467 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3468 : ISD::FP_TO_UINT,
3469 SL, MVT::i32, FloorMul);
3470 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3471
3472 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3473 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3474
3475 if (Signed && SrcVT == MVT::f32) {
3476 assert(Sign);
3477 // Flip the result based on the signedness, which is either all 0s or 1s.
3478 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3479 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3480 // r := xor(r, sign) - sign;
3481 Result =
3482 DAG.getNode(ISD::SUB, SL, MVT::i64,
3483 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3484 }
3485
3486 return Result;
3487}
3488
3489SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3490   SDLoc DL(Op);
3491 SDValue N0 = Op.getOperand(0);
3492
3493 // Convert to target node to get known bits
3494 if (N0.getValueType() == MVT::f32)
3495 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3496
3497 if (getTargetMachine().Options.UnsafeFPMath) {
3498 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3499 return SDValue();
3500 }
3501
3502 assert(N0.getSimpleValueType() == MVT::f64);
3503
3504 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3505 const unsigned ExpMask = 0x7ff;
3506 const unsigned ExpBiasf64 = 1023;
3507 const unsigned ExpBiasf16 = 15;
3508 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3509 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3510 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3511 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3512 DAG.getConstant(32, DL, MVT::i64));
3513 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3514 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3515 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3516 DAG.getConstant(20, DL, MVT::i64));
3517 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3518 DAG.getConstant(ExpMask, DL, MVT::i32));
3519 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3520 // add the f16 bias (15) to get the biased exponent for the f16 format.
3521 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3522 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
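 // E.g. for 1.0 the f64 exponent field is 1023, so E becomes
 // 1023 - 1023 + 15 = 15, the biased f16 exponent of 1.0.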
3523
3524 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3525 DAG.getConstant(8, DL, MVT::i32));
3526 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3527 DAG.getConstant(0xffe, DL, MVT::i32));
3528
3529 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3530 DAG.getConstant(0x1ff, DL, MVT::i32));
3531 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3532
3533 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3534 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3535
3536 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3537 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3538 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3539 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
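 // I is the Inf/NaN result: 0x7c00 is f16 infinity, and the 0x0200 mantissa
 // bit is set when the source mantissa is nonzero (NaN). It is selected below
 // when E == 1039, i.e. when the f64 exponent field is all ones (0x7ff),
 // since 0x7ff - 1023 + 15 == 1039.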
3540
3541 // N = M | (E << 12);
3542 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3543 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3544 DAG.getConstant(12, DL, MVT::i32)));
3545
3546 // B = clamp(1-E, 0, 13);
3547 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3548 One, E);
3549 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3550 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3551 DAG.getConstant(13, DL, MVT::i32));
3552
3553 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3554 DAG.getConstant(0x1000, DL, MVT::i32));
3555
3556 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3557 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3558 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3559 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3560
3561 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3562 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3563 DAG.getConstant(0x7, DL, MVT::i32));
3564 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3565 DAG.getConstant(2, DL, MVT::i32));
3566 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3567 One, Zero, ISD::SETEQ);
3568 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3569 One, Zero, ISD::SETGT);
3570 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3571 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3572
3573 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3574 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3575 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3576 I, V, ISD::SETEQ);
3577
3578 // Extract the sign bit.
3579 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3580 DAG.getConstant(16, DL, MVT::i32));
3581 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3582 DAG.getConstant(0x8000, DL, MVT::i32));
3583
3584 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3585 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3586}
3587
3588SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3589    SelectionDAG &DAG) const {
3590 SDValue Src = Op.getOperand(0);
3591 unsigned OpOpcode = Op.getOpcode();
3592 EVT SrcVT = Src.getValueType();
3593 EVT DestVT = Op.getValueType();
3594
3595 // Will be selected natively
3596 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3597 return Op;
3598
3599 if (SrcVT == MVT::bf16) {
3600 SDLoc DL(Op);
3601 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3602 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3603 }
3604
3605 // Promote i16 to i32
3606 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3607 SDLoc DL(Op);
3608
3609 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3610 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3611 }
3612
3613 if (DestVT != MVT::i64)
3614 return Op;
3615
3616 if (SrcVT == MVT::f16 ||
3617 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3618 SDLoc DL(Op);
3619
3620 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3621 unsigned Ext =
3622 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3623 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3624 }
3625
3626 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3627 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3628
3629 return SDValue();
3630}
3631
3632SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3633    SelectionDAG &DAG) const {
3634 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3635 MVT VT = Op.getSimpleValueType();
3636 MVT ScalarVT = VT.getScalarType();
3637
3638 assert(VT.isVector());
3639
3640 SDValue Src = Op.getOperand(0);
3641 SDLoc DL(Op);
3642
3643 // TODO: Don't scalarize on Evergreen?
3644 unsigned NElts = VT.getVectorNumElements();
3645 SmallVector<SDValue, 8> Args;
3646 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3647
3648 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3649 for (unsigned I = 0; I < NElts; ++I)
3650 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3651
3652 return DAG.getBuildVector(VT, DL, Args);
3653}
3654
3655//===----------------------------------------------------------------------===//
3656// Custom DAG optimizations
3657//===----------------------------------------------------------------------===//
3658
3659static bool isU24(SDValue Op, SelectionDAG &DAG) {
3660 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3661}
3662
3663static bool isI24(SDValue Op, SelectionDAG &DAG) {
3664 EVT VT = Op.getValueType();
3665 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3666 // as unsigned 24-bit values.
3667 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3668}
3669
3670SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3671                                            DAGCombinerInfo &DCI) const {
3672  SelectionDAG &DAG = DCI.DAG;
3673 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3674 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3675
3676 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3677 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3678 unsigned NewOpcode = Node24->getOpcode();
3679 if (IsIntrin) {
3680 unsigned IID = Node24->getConstantOperandVal(0);
3681 switch (IID) {
3682 case Intrinsic::amdgcn_mul_i24:
3683 NewOpcode = AMDGPUISD::MUL_I24;
3684 break;
3685 case Intrinsic::amdgcn_mul_u24:
3686 NewOpcode = AMDGPUISD::MUL_U24;
3687 break;
3688 case Intrinsic::amdgcn_mulhi_i24:
3689 NewOpcode = AMDGPUISD::MULHI_I24;
3690 break;
3691 case Intrinsic::amdgcn_mulhi_u24:
3692 NewOpcode = AMDGPUISD::MULHI_U24;
3693 break;
3694 default:
3695 llvm_unreachable("Expected 24-bit mul intrinsic");
3696 }
3697 }
3698
3699 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3700
3701 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3702 // the operands to have other uses, but will only perform simplifications that
3703 // involve bypassing some nodes for this user.
3704 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3705 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3706 if (DemandedLHS || DemandedRHS)
3707 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3708 DemandedLHS ? DemandedLHS : LHS,
3709 DemandedRHS ? DemandedRHS : RHS);
3710
3711 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3712 // operands if this node is the only user.
3713 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3714 return SDValue(Node24, 0);
3715 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3716 return SDValue(Node24, 0);
3717
3718 return SDValue();
3719}
3720
3721template <typename IntTy>
3722static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3723   uint32_t Width, const SDLoc &DL) {
3724 if (Width + Offset < 32) {
3725 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3726 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3727 return DAG.getConstant(Result, DL, MVT::i32);
3728 }
3729
3730 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3731}
3732
3733static bool hasVolatileUser(SDNode *Val) {
3734 for (SDNode *U : Val->uses()) {
3735 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3736 if (M->isVolatile())
3737 return true;
3738 }
3739 }
3740
3741 return false;
3742}
3743
3745 // i32 vectors are the canonical memory type.
3746 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3747 return false;
3748
3749 if (!VT.isByteSized())
3750 return false;
3751
3752 unsigned Size = VT.getStoreSize();
3753
3754 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3755 return false;
3756
3757 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3758 return false;
3759
3760 return true;
3761}
3762
3763// Replace a load of an illegal type with a load of a friendlier type plus a
3764// bitcast back to the original type.
3765SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3766   DAGCombinerInfo &DCI) const {
3767 if (!DCI.isBeforeLegalize())
3768 return SDValue();
3769
3770 LoadSDNode *LN = cast<LoadSDNode>(N);
3771 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3772 return SDValue();
3773
3774 SDLoc SL(N);
3775 SelectionDAG &DAG = DCI.DAG;
3776 EVT VT = LN->getMemoryVT();
3777
3778 unsigned Size = VT.getStoreSize();
3779 Align Alignment = LN->getAlign();
3780 if (Alignment < Size && isTypeLegal(VT)) {
3781 unsigned IsFast;
3782 unsigned AS = LN->getAddressSpace();
3783
3784 // Expand unaligned loads earlier than legalization. Due to visitation order
3785 // problems during legalization, the emitted instructions to pack and unpack
3786 // the bytes again are not eliminated in the case of an unaligned copy.
3787 if (!allowsMisalignedMemoryAccesses(
3788 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3789 if (VT.isVector())
3790 return SplitVectorLoad(SDValue(LN, 0), DAG);
3791
3792 SDValue Ops[2];
3793 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3794
3795 return DAG.getMergeValues(Ops, SDLoc(N));
3796 }
3797
3798 if (!IsFast)
3799 return SDValue();
3800 }
3801
3802 if (!shouldCombineMemoryType(VT))
3803 return SDValue();
3804
3805 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3806
3807 SDValue NewLoad
3808 = DAG.getLoad(NewVT, SL, LN->getChain(),
3809 LN->getBasePtr(), LN->getMemOperand());
3810
3811 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3812 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3813 return SDValue(N, 0);
3814}
3815
3816// Replace store of an illegal type with a store of a bitcast to a friendlier
3817// type.
3818SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3819  DAGCombinerInfo &DCI) const {
3820 if (!DCI.isBeforeLegalize())
3821 return SDValue();
3822
3823 StoreSDNode *SN = cast<StoreSDNode>(N);
3824 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3825 return SDValue();
3826
3827 EVT VT = SN->getMemoryVT();
3828 unsigned Size = VT.getStoreSize();
3829
3830 SDLoc SL(N);
3831 SelectionDAG &DAG = DCI.DAG;
3832 Align Alignment = SN->getAlign();
3833 if (Alignment < Size && isTypeLegal(VT)) {
3834 unsigned IsFast;
3835 unsigned AS = SN->getAddressSpace();
3836
3837 // Expand unaligned stores earlier than legalization. Due to visitation
3838 // order problems during legalization, the emitted instructions to pack and
3839 // unpack the bytes again are not eliminated in the case of an unaligned
3840 // copy.
3841 if (!allowsMisalignedMemoryAccesses(
3842 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3843 if (VT.isVector())
3844 return SplitVectorStore(SDValue(SN, 0), DAG);
3845
3846 return expandUnalignedStore(SN, DAG);
3847 }
3848
3849 if (!IsFast)
3850 return SDValue();
3851 }
3852
3853 if (!shouldCombineMemoryType(VT))
3854 return SDValue();
3855
3856 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3857 SDValue Val = SN->getValue();
3858
3859 //DCI.AddToWorklist(Val.getNode());
3860
3861 bool OtherUses = !Val.hasOneUse();
3862 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3863 if (OtherUses) {
3864 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3865 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3866 }
3867
3868 return DAG.getStore(SN->getChain(), SL, CastVal,
3869 SN->getBasePtr(), SN->getMemOperand());
3870}
3871
3872// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3873// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3874// issues.
3875SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3876  DAGCombinerInfo &DCI) const {
3877 SelectionDAG &DAG = DCI.DAG;
3878 SDValue N0 = N->getOperand(0);
3879
3880 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3881 // (vt2 (truncate (assertzext vt0:x, vt1)))
3882 if (N0.getOpcode() == ISD::TRUNCATE) {
3883 SDValue N1 = N->getOperand(1);
3884 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3885 SDLoc SL(N);
3886
3887 SDValue Src = N0.getOperand(0);
3888 EVT SrcVT = Src.getValueType();
3889 if (SrcVT.bitsGE(ExtVT)) {
3890 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3891 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3892 }
3893 }
3894
3895 return SDValue();
3896}
3897
3898SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3899  SDNode *N, DAGCombinerInfo &DCI) const {
3900 unsigned IID = N->getConstantOperandVal(0);
3901 switch (IID) {
3902 case Intrinsic::amdgcn_mul_i24:
3903 case Intrinsic::amdgcn_mul_u24:
3904 case Intrinsic::amdgcn_mulhi_i24:
3905 case Intrinsic::amdgcn_mulhi_u24:
3906 return simplifyMul24(N, DCI);
3907 case Intrinsic::amdgcn_fract:
3908 case Intrinsic::amdgcn_rsq:
3909 case Intrinsic::amdgcn_rcp_legacy:
3910 case Intrinsic::amdgcn_rsq_legacy:
3911 case Intrinsic::amdgcn_rsq_clamp: {
3912 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3913 SDValue Src = N->getOperand(1);
3914 return Src.isUndef() ? Src : SDValue();
3915 }
3916 case Intrinsic::amdgcn_frexp_exp: {
3917 // frexp_exp (fneg x) -> frexp_exp x
3918 // frexp_exp (fabs x) -> frexp_exp x
3919 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3920 SDValue Src = N->getOperand(1);
3921 SDValue PeekSign = peekFPSignOps(Src);
3922 if (PeekSign == Src)
3923 return SDValue();
3924 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3925 0);
3926 }
3927 default:
3928 return SDValue();
3929 }
3930}
3931
3932/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3933/// binary operation \p Opc to it with the corresponding constant operands.
3934SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3935  DAGCombinerInfo &DCI, const SDLoc &SL,
3936 unsigned Opc, SDValue LHS,
3937 uint32_t ValLo, uint32_t ValHi) const {
3938 SelectionDAG &DAG = DCI.DAG;
3939 SDValue Lo, Hi;
3940 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3941
3942 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3943 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3944
3945 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3946 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3947
3948 // Re-visit the ands. It's possible we eliminated one of them and it could
3949 // simplify the vector.
3950 DCI.AddToWorklist(Lo.getNode());
3951 DCI.AddToWorklist(Hi.getNode());
3952
3953 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3954 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3955}
3956
3957SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3958  DAGCombinerInfo &DCI) const {
3959 EVT VT = N->getValueType(0);
3960
3961 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3962 if (!RHS)
3963 return SDValue();
3964
3965 SDValue LHS = N->getOperand(0);
3966 unsigned RHSVal = RHS->getZExtValue();
3967 if (!RHSVal)
3968 return LHS;
3969
3970 SDLoc SL(N);
3971 SelectionDAG &DAG = DCI.DAG;
3972
3973 switch (LHS->getOpcode()) {
3974 default:
3975 break;
3976 case ISD::ZERO_EXTEND:
3977 case ISD::SIGN_EXTEND:
3978 case ISD::ANY_EXTEND: {
3979 SDValue X = LHS->getOperand(0);
3980
3981 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3982 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3983 // Prefer build_vector as the canonical form if packed types are legal.
3984 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3985 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3986 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3987 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3988 }
3989
3990 // shl (ext x) => zext (shl x), if shift does not overflow int
3991 if (VT != MVT::i64)
3992 break;
3993 KnownBits Known = DAG.computeKnownBits(X);
3994 unsigned LZ = Known.countMinLeadingZeros();
3995 if (LZ < RHSVal)
3996 break;
3997 EVT XVT = X.getValueType();
3998 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3999 return DAG.getZExtOrTrunc(Shl, SL, VT);
4000 }
4001 }
4002
4003 if (VT != MVT::i64)
4004 return SDValue();
4005
4006 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4007
4008 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4009 // common case, splitting this into a move and a 32-bit shift is faster and
4010 // the same code size.
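 // E.g. (shl i64:x, 40) becomes a v2i32 build_vector {0, shl(lo_32(x), 8)}
 // bitcast back to i64: the low word is zero and the high word is the low
 // word of x shifted left by 8.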
4011 if (RHSVal < 32)
4012 return SDValue();
4013
4014 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4015
4016 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4017 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4018
4019 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4020
4021 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4022 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4023}
4024
4025SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4026  DAGCombinerInfo &DCI) const {
4027 if (N->getValueType(0) != MVT::i64)
4028 return SDValue();
4029
4030 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4031 if (!RHS)
4032 return SDValue();
4033
4034 SelectionDAG &DAG = DCI.DAG;
4035 SDLoc SL(N);
4036 unsigned RHSVal = RHS->getZExtValue();
4037
4038 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4039 if (RHSVal == 32) {
4040 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4041 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4042 DAG.getConstant(31, SL, MVT::i32));
4043
4044 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4045 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4046 }
4047
4048 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4049 if (RHSVal == 63) {
4050 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4051 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4052 DAG.getConstant(31, SL, MVT::i32));
4053 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4054 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4055 }
4056
4057 return SDValue();
4058}
4059
4060SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4061  DAGCombinerInfo &DCI) const {
4062 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4063 if (!RHS)
4064 return SDValue();
4065
4066 EVT VT = N->getValueType(0);
4067 SDValue LHS = N->getOperand(0);
4068 unsigned ShiftAmt = RHS->getZExtValue();
4069 SelectionDAG &DAG = DCI.DAG;
4070 SDLoc SL(N);
4071
4072 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4073 // this improves the ability to match BFE patterns in isel.
4074 if (LHS.getOpcode() == ISD::AND) {
4075 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4076 unsigned MaskIdx, MaskLen;
4077 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4078 MaskIdx == ShiftAmt) {
4079 return DAG.getNode(
4080 ISD::AND, SL, VT,
4081 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4082 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4083 }
4084 }
4085 }
4086
4087 if (VT != MVT::i64)
4088 return SDValue();
4089
4090 if (ShiftAmt < 32)
4091 return SDValue();
4092
4093 // srl i64:x, C for C >= 32
4094 // =>
4095 // build_pair (srl hi_32(x), C - 32), 0
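 // E.g. (srl i64:x, 40) becomes {srl(hi_32(x), 8), 0}: the low word holds the
 // upper half of x shifted right by 8 and the high word is zero.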
4096 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4097
4098 SDValue Hi = getHiHalf64(LHS, DAG);
4099
4100 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4101 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4102
4103 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4104
4105 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4106}
4107
4108SDValue AMDGPUTargetLowering::performTruncateCombine(
4109 SDNode *N, DAGCombinerInfo &DCI) const {
4110 SDLoc SL(N);
4111 SelectionDAG &DAG = DCI.DAG;
4112 EVT VT = N->getValueType(0);
4113 SDValue Src = N->getOperand(0);
4114
4115 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4116 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4117 SDValue Vec = Src.getOperand(0);
4118 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4119 SDValue Elt0 = Vec.getOperand(0);
4120 EVT EltVT = Elt0.getValueType();
4121 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4122 if (EltVT.isFloatingPoint()) {
4123 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4124 EltVT.changeTypeToInteger(), Elt0);
4125 }
4126
4127 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4128 }
4129 }
4130 }
4131
4132 // Equivalent of above for accessing the high element of a vector as an
4133 // integer operation.
4134 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4135 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4136 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4137 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4138 SDValue BV = stripBitcast(Src.getOperand(0));
4139 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4140 BV.getValueType().getVectorNumElements() == 2) {
4141 SDValue SrcElt = BV.getOperand(1);
4142 EVT SrcEltVT = SrcElt.getValueType();
4143 if (SrcEltVT.isFloatingPoint()) {
4144 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4145 SrcEltVT.changeTypeToInteger(), SrcElt);
4146 }
4147
4148 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4149 }
4150 }
4151 }
4152 }
4153
4154 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4155 //
4156 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4157 // i16 (trunc (srl (i32 (trunc x), K)))
4158 if (VT.getScalarSizeInBits() < 32) {
4159 EVT SrcVT = Src.getValueType();
4160 if (SrcVT.getScalarSizeInBits() > 32 &&
4161 (Src.getOpcode() == ISD::SRL ||
4162 Src.getOpcode() == ISD::SRA ||
4163 Src.getOpcode() == ISD::SHL)) {
4164 SDValue Amt = Src.getOperand(1);
4165 KnownBits Known = DAG.computeKnownBits(Amt);
4166
4167 // - For left shifts, do the transform as long as the shift
4168 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4169 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4170 // losing information stored in the high bits when truncating.
4171 const unsigned MaxCstSize =
4172 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4173 if (Known.getMaxValue().ule(MaxCstSize)) {
4174 EVT MidVT = VT.isVector() ?
4175 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4176 VT.getVectorNumElements()) : MVT::i32;
4177
4178 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4179 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4180 Src.getOperand(0));
4181 DCI.AddToWorklist(Trunc.getNode());
4182
4183 if (Amt.getValueType() != NewShiftVT) {
4184 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4185 DCI.AddToWorklist(Amt.getNode());
4186 }
4187
4188 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4189 Trunc, Amt);
4190 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4191 }
4192 }
4193 }
4194
4195 return SDValue();
4196}
4197
4198// We need to specifically handle i64 mul here to avoid unnecessary conversion
4199// instructions. If we only match on the legalized i64 mul expansion,
4200// SimplifyDemandedBits will be unable to remove them because there will be
4201// multiple uses due to the separate mul + mulh[su].
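// Both operands are known to fit in 24 bits, so the full product fits in 48
// bits; for results wider than 32 bits, the MUL_*24 / MULHI_*24 pair below
// reconstructs the exact low and high 32-bit halves.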
4202static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4203 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4204 if (Size <= 32) {
4205 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4206 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4207 }
4208
4209 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4210 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4211
4212 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4213 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4214
4215 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4216}
4217
4218/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4219/// return SDValue().
4220static SDValue getAddOneOp(const SDNode *V) {
4221 if (V->getOpcode() != ISD::ADD)
4222 return SDValue();
4223
4224 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4225}
4226
4227SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4228  DAGCombinerInfo &DCI) const {
4229 assert(N->getOpcode() == ISD::MUL);
4230 EVT VT = N->getValueType(0);
4231
4232 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4233 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4234 // unnecessarily). isDivergent() is used as an approximation of whether the
4235 // value is in an SGPR.
4236 if (!N->isDivergent())
4237 return SDValue();
4238
4239 unsigned Size = VT.getSizeInBits();
4240 if (VT.isVector() || Size > 64)
4241 return SDValue();
4242
4243 SelectionDAG &DAG = DCI.DAG;
4244 SDLoc DL(N);
4245
4246 SDValue N0 = N->getOperand(0);
4247 SDValue N1 = N->getOperand(1);
4248
4249 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4250 // matching.
4251
4252 // mul x, (add y, 1) -> add (mul x, y), x
4253 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4254 SDValue AddOp = getAddOneOp(V.getNode());
4255 if (!AddOp)
4256 return SDValue();
4257
4258 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4259 return U->getOpcode() == ISD::MUL;
4260 }))
4261 return AddOp;
4262
4263 return SDValue();
4264 };
4265
4266 // FIXME: The selection pattern is not properly checking for commuted
4267 // operands, so we have to place the mul in the LHS
4268 if (SDValue MulOper = IsFoldableAdd(N0)) {
4269 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4270 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4271 }
4272
4273 if (SDValue MulOper = IsFoldableAdd(N1)) {
4274 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4275 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4276 }
4277
4278 // There are i16 integer mul/mad.
4279 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4280 return SDValue();
4281
4282 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4283 // in the source into any_extends if the result of the mul is truncated. Since
4284 // we can assume the high bits are whatever we want, use the underlying value
4285 // to keep the unknown high bits from interfering.
4286 if (N0.getOpcode() == ISD::ANY_EXTEND)
4287 N0 = N0.getOperand(0);
4288
4289 if (N1.getOpcode() == ISD::ANY_EXTEND)
4290 N1 = N1.getOperand(0);
4291
4292 SDValue Mul;
4293
4294 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4295 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4296 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4297 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4298 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4299 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4300 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4301 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4302 } else {
4303 return SDValue();
4304 }
4305
4306 // We need to use sext even for MUL_U24, because MUL_U24 is used
4307 // for signed multiply of 8 and 16-bit types.
4308 return DAG.getSExtOrTrunc(Mul, DL, VT);
4309}
4310
4311SDValue
4312AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4313  DAGCombinerInfo &DCI) const {
4314 if (N->getValueType(0) != MVT::i32)
4315 return SDValue();
4316
4317 SelectionDAG &DAG = DCI.DAG;
4318 SDLoc DL(N);
4319
4320 SDValue N0 = N->getOperand(0);
4321 SDValue N1 = N->getOperand(1);
4322
4323 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4324 // in the source into any_extends if the result of the mul is truncated. Since
4325 // we can assume the high bits are whatever we want, use the underlying value
4326 // to keep the unknown high bits from interfering.
4327 if (N0.getOpcode() == ISD::ANY_EXTEND)
4328 N0 = N0.getOperand(0);
4329 if (N1.getOpcode() == ISD::ANY_EXTEND)
4330 N1 = N1.getOperand(0);
4331
4332 // Try to use two fast 24-bit multiplies (one for each half of the result)
4333 // instead of one slow extending multiply.
4334 unsigned LoOpcode, HiOpcode;
4335 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4336 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4337 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4338 LoOpcode = AMDGPUISD::MUL_U24;
4339 HiOpcode = AMDGPUISD::MULHI_U24;
4340 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4341 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4342 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4343 LoOpcode = AMDGPUISD::MUL_I24;
4344 HiOpcode = AMDGPUISD::MULHI_I24;
4345 } else {
4346 return SDValue();
4347 }
4348
4349 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4350 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4351 DCI.CombineTo(N, Lo, Hi);
4352 return SDValue(N, 0);
4353}
4354
4355SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4356  DAGCombinerInfo &DCI) const {
4357 EVT VT = N->getValueType(0);
4358
4359 if (!Subtarget->hasMulI24() || VT.isVector())
4360 return SDValue();
4361
4362 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4363 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4364 // unnecessarily). isDivergent() is used as an approximation of whether the
4365 // value is in an SGPR.
4366 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4367 // valu op anyway)
4368 if (Subtarget->hasSMulHi() && !N->isDivergent())
4369 return SDValue();
4370
4371 SelectionDAG &DAG = DCI.DAG;
4372 SDLoc DL(N);
4373
4374 SDValue N0 = N->getOperand(0);
4375 SDValue N1 = N->getOperand(1);
4376
4377 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4378 return SDValue();
4379
4380 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4381 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4382
4383 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4384 DCI.AddToWorklist(Mulhi.getNode());
4385 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4386}
4387
4388SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4389  DAGCombinerInfo &DCI) const {
4390 EVT VT = N->getValueType(0);
4391
4392 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4393 return SDValue();
4394
4395 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4396 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4397 // unnecessarily). isDivergent() is used as an approximation of whether the
4398 // value is in an SGPR.
4399 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4400 // valu op anyway)
4401 if (Subtarget->hasSMulHi() && !N->isDivergent())
4402 return SDValue();
4403
4404 SelectionDAG &DAG = DCI.DAG;
4405 SDLoc DL(N);
4406
4407 SDValue N0 = N->getOperand(0);
4408 SDValue N1 = N->getOperand(1);
4409
4410 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4411 return SDValue();
4412
4413 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4414 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4415
4416 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4417 DCI.AddToWorklist(Mulhi.getNode());
4418 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4419}
4420
4421SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4422 SDValue Op,
4423 const SDLoc &DL,
4424 unsigned Opc) const {
4425 EVT VT = Op.getValueType();
4426 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4427 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4428 LegalVT != MVT::i16))
4429 return SDValue();
4430
4431 if (VT != MVT::i32)
4432 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4433
4434 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4435 if (VT != MVT::i32)
4436 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4437
4438 return FFBX;
4439}
4440
4441// The native instructions return -1 on 0 input. Optimize out a select that
4442// produces -1 on 0.
4443//
4444// TODO: If zero is not undef, we could also do this if the output is compared
4445// against the bitwidth.
4446//
4447// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4448SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4449  SDValue LHS, SDValue RHS,
4450 DAGCombinerInfo &DCI) const {
4451 if (!isNullConstant(Cond.getOperand(1)))
4452 return SDValue();
4453
4454 SelectionDAG &DAG = DCI.DAG;
4455 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4456 SDValue CmpLHS = Cond.getOperand(0);
4457
4458 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4459 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4460 if (CCOpcode == ISD::SETEQ &&
4461 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4462 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4463 unsigned Opc =
4464 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4465 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4466 }
4467
4468 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4469 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4470 if (CCOpcode == ISD::SETNE &&
4471 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4472 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4473 unsigned Opc =
4474 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4475
4476 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4477 }
4478
4479 return SDValue();
4480}
4481
4482static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4483 unsigned Op,
4484 const SDLoc &SL,
4485 SDValue Cond,
4486 SDValue N1,
4487 SDValue N2) {
4488 SelectionDAG &DAG = DCI.DAG;
4489 EVT VT = N1.getValueType();
4490
4491 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4492 N1.getOperand(0), N2.getOperand(0));
4493 DCI.AddToWorklist(NewSelect.getNode());
4494 return DAG.getNode(Op, SL, VT, NewSelect);
4495}
4496
4497// Pull a free FP operation out of a select so it may fold into uses.
4498//
4499// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4500// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4501//
4502// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4503// select c, (fabs x), +k -> fabs (select c, x, k)
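// Hoisting the fneg/fabs above the select leaves a single modifier on the
// select result, which can then fold into its users as a free source
// modifier instead of needing one operation per select input.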
4504SDValue
4505AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4506  SDValue N) const {
4507 SelectionDAG &DAG = DCI.DAG;
4508 SDValue Cond = N.getOperand(0);
4509 SDValue LHS = N.getOperand(1);
4510 SDValue RHS = N.getOperand(2);
4511
4512 EVT VT = N.getValueType();
4513 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4514 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4515 if (!allUsesHaveSourceMods(N.getNode()))
4516 return SDValue();
4517
4518 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4519 SDLoc(N), Cond, LHS, RHS);
4520 }
4521
4522 bool Inv = false;
4523 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4524 std::swap(LHS, RHS);
4525 Inv = true;
4526 }
4527
4528 // TODO: Support vector constants.
4529 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4530 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4531 !selectSupportsSourceMods(N.getNode())) {
4532 SDLoc SL(N);
4533 // If one side is an fneg/fabs and the other is a constant, we can push the
4534 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4535 SDValue NewLHS = LHS.getOperand(0);
4536 SDValue NewRHS = RHS;
4537
4538 // Careful: if the neg can be folded up, don't try to pull it back down.
4539 bool ShouldFoldNeg = true;
4540
4541 if (NewLHS.hasOneUse()) {
4542 unsigned Opc = NewLHS.getOpcode();
4543 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4544 ShouldFoldNeg = false;
4545 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4546 ShouldFoldNeg = false;
4547 }
4548
4549 if (ShouldFoldNeg) {
4550 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4551 return SDValue();
4552
4553 // We're going to be forced to use a source modifier anyway, there's no
4554 // point to pulling the negate out unless we can get a size reduction by
4555 // negating the constant.
4556 //
4557 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4558 // about cheaper constants.
4559 if (NewLHS.getOpcode() == ISD::FABS &&
4560 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4561 return SDValue();
4562
4563 if (!allUsesHaveSourceMods(N.getNode()))
4564 return SDValue();
4565
4566 if (LHS.getOpcode() == ISD::FNEG)
4567 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4568
4569 if (Inv)
4570 std::swap(NewLHS, NewRHS);
4571
4572 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4573 Cond, NewLHS, NewRHS);
4574 DCI.AddToWorklist(NewSelect.getNode());
4575 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4576 }
4577 }
4578
4579 return SDValue();
4580}
4581
4582SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4583  DAGCombinerInfo &DCI) const {
4584 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4585 return Folded;
4586
4587 SDValue Cond = N->getOperand(0);
4588 if (Cond.getOpcode() != ISD::SETCC)
4589 return SDValue();
4590
4591 EVT VT = N->getValueType(0);
4592 SDValue LHS = Cond.getOperand(0);
4593 SDValue RHS = Cond.getOperand(1);
4594 SDValue CC = Cond.getOperand(2);
4595
4596 SDValue True = N->getOperand(1);
4597 SDValue False = N->getOperand(2);
4598
4599 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4600 SelectionDAG &DAG = DCI.DAG;
4601 if (DAG.isConstantValueOfAnyType(True) &&
4602 !DAG.isConstantValueOfAnyType(False)) {
4603 // Swap cmp + select pair to move constant to false input.
4604 // This will allow using VOPC cndmasks more often.
4605 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4606
4607 SDLoc SL(N);
4608 ISD::CondCode NewCC =
4609 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4610
4611 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4612 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4613 }
4614
4615 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4616 SDValue MinMax
4617 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4618 // Revisit this node so we can catch min3/max3/med3 patterns.
4619 //DCI.AddToWorklist(MinMax.getNode());
4620 return MinMax;
4621 }
4622 }
4623
4624 // There's no reason to not do this if the condition has other uses.
4625 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4626}
4627
4628static bool isInv2Pi(const APFloat &APF) {
4629 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4630 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4631 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4632
4633 return APF.bitwiseIsEqual(KF16) ||
4634 APF.bitwiseIsEqual(KF32) ||
4635 APF.bitwiseIsEqual(KF64);
4636}
4637
4638// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4639// additional cost to negate them.
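// E.g. on subtargets with the 1/(2*pi) inline immediate, +0.15915494 is free
// as an inline constant but its negation is not, so folding an fneg into it
// would force a 32-bit literal.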
4640TargetLowering::NegatibleCost
4641AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4642 if (C->isZero())
4643 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4644
4645 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4646 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4647
4648 return NegatibleCost::Neutral;
4649}
4650
4651bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4652 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4653 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4654  return false;
4655}
4656
4657bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4658 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4659 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4660  return false;
4661}
4662
4663static unsigned inverseMinMax(unsigned Opc) {
4664 switch (Opc) {
4665 case ISD::FMAXNUM:
4666 return ISD::FMINNUM;
4667 case ISD::FMINNUM:
4668 return ISD::FMAXNUM;
4669 case ISD::FMAXNUM_IEEE:
4670 return ISD::FMINNUM_IEEE;
4671 case ISD::FMINNUM_IEEE:
4672 return ISD::FMAXNUM_IEEE;
4673 case ISD::FMAXIMUM:
4674 return ISD::FMINIMUM;
4675 case ISD::FMINIMUM:
4676 return ISD::FMAXIMUM;
4677 case AMDGPUISD::FMAX_LEGACY:
4678 return AMDGPUISD::FMIN_LEGACY;
4679 case AMDGPUISD::FMIN_LEGACY:
4680 return AMDGPUISD::FMAX_LEGACY;
4681 default:
4682 llvm_unreachable("invalid min/max opcode");
4683 }
4684}
4685
4686/// \return true if it's profitable to try to push an fneg into its source
4687/// instruction.
4688static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4689 // If the input has multiple uses and we can either fold the negate down, or
4690 // the other uses cannot, give up. This both prevents unprofitable
4691 // transformations and infinite loops: we won't repeatedly try to fold around
4692 // a negate that has no 'good' form.
4693 if (N0.hasOneUse()) {
4694 // This may be able to fold into the source, but at a code size cost. Don't
4695 // fold if the fold into the user is free.
4696 if (allUsesHaveSourceMods(N, 0))
4697 return false;
4698 } else {
4699 if (fnegFoldsIntoOp(N0.getNode()) &&
4700 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4701 return false;
4702 }
4703
4704 return true;
4705}
4706
4707SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4708  DAGCombinerInfo &DCI) const {
4709 SelectionDAG &DAG = DCI.DAG;
4710 SDValue N0 = N->getOperand(0);
4711 EVT VT = N->getValueType(0);
4712
4713 unsigned Opc = N0.getOpcode();
4714
4715 if (!shouldFoldFNegIntoSrc(N, N0))
4716 return SDValue();
4717
4718 SDLoc SL(N);
4719 switch (Opc) {
4720 case ISD::FADD: {
4721 if (!mayIgnoreSignedZero(N0))
4722 return SDValue();
4723
4724 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4725 SDValue LHS = N0.getOperand(0);
4726 SDValue RHS = N0.getOperand(1);
4727
4728 if (LHS.getOpcode() != ISD::FNEG)
4729 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4730 else
4731 LHS = LHS.getOperand(0);
4732
4733 if (RHS.getOpcode() != ISD::FNEG)
4734 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4735 else
4736 RHS = RHS.getOperand(0);
4737
4738 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4739 if (Res.getOpcode() != ISD::FADD)
4740 return SDValue(); // Op got folded away.
4741 if (!N0.hasOneUse())
4742 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4743 return Res;
4744 }
4745 case ISD::FMUL:
4746 case AMDGPUISD::FMUL_LEGACY: {
4747 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4748 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4749 SDValue LHS = N0.getOperand(0);
4750 SDValue RHS = N0.getOperand(1);
4751
4752 if (LHS.getOpcode() == ISD::FNEG)
4753 LHS = LHS.getOperand(0);
4754 else if (RHS.getOpcode() == ISD::FNEG)
4755 RHS = RHS.getOperand(0);
4756 else
4757 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4758
4759 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4760 if (Res.getOpcode() != Opc)
4761 return SDValue(); // Op got folded away.
4762 if (!N0.hasOneUse())
4763 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4764 return Res;
4765 }
4766 case ISD::FMA:
4767 case ISD::FMAD: {
4768 // TODO: handle llvm.amdgcn.fma.legacy
4769 if (!mayIgnoreSignedZero(N0))
4770 return SDValue();
4771
4772 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4773 SDValue LHS = N0.getOperand(0);
4774 SDValue MHS = N0.getOperand(1);
4775 SDValue RHS = N0.getOperand(2);
4776
4777 if (LHS.getOpcode() == ISD::FNEG)
4778 LHS = LHS.getOperand(0);
4779 else if (MHS.getOpcode() == ISD::FNEG)
4780 MHS = MHS.getOperand(0);
4781 else
4782 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4783
4784 if (RHS.getOpcode() != ISD::FNEG)
4785 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4786 else
4787 RHS = RHS.getOperand(0);
4788
4789 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4790 if (Res.getOpcode() != Opc)
4791 return SDValue(); // Op got folded away.
4792 if (!N0.hasOneUse())
4793 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4794 return Res;
4795 }
4796 case ISD::FMAXNUM:
4797 case ISD::FMINNUM:
4798 case ISD::FMAXNUM_IEEE:
4799 case ISD::FMINNUM_IEEE:
4800 case ISD::FMINIMUM:
4801 case ISD::FMAXIMUM:
4802 case AMDGPUISD::FMAX_LEGACY:
4803 case AMDGPUISD::FMIN_LEGACY: {
4804 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4805 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4806 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4807 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4808
4809 SDValue LHS = N0.getOperand(0);
4810 SDValue RHS = N0.getOperand(1);
4811
4812 // 0 doesn't have a negated inline immediate.
4813 // TODO: This constant check should be generalized to other operations.
4814 if (isConstantCostlierToNegate(RHS))
4815 return SDValue();
4816
4817 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4818 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4819 unsigned Opposite = inverseMinMax(Opc);
4820
4821 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4822 if (Res.getOpcode() != Opposite)
4823 return SDValue(); // Op got folded away.
4824 if (!N0.hasOneUse())
4825 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4826 return Res;
4827 }
4828 case AMDGPUISD::FMED3: {
4829 SDValue Ops[3];
4830 for (unsigned I = 0; I < 3; ++I)
4831 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4832
4833 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4834 if (Res.getOpcode() != AMDGPUISD::FMED3)
4835 return SDValue(); // Op got folded away.
4836
4837 if (!N0.hasOneUse()) {
4838 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4839 DAG.ReplaceAllUsesWith(N0, Neg);
4840
4841 for (SDNode *U : Neg->uses())
4842 DCI.AddToWorklist(U);
4843 }
4844
4845 return Res;
4846 }
4847 case ISD::FP_EXTEND:
4848 case ISD::FTRUNC:
4849 case ISD::FRINT:
4850 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4851 case ISD::FROUNDEVEN:
4852 case ISD::FSIN:
4853 case ISD::FCANONICALIZE:
4854 case AMDGPUISD::RCP:
4855 case AMDGPUISD::RCP_LEGACY:
4856 case AMDGPUISD::RCP_IFLAG:
4857 case AMDGPUISD::SIN_HW: {
4858 SDValue CvtSrc = N0.getOperand(0);
4859 if (CvtSrc.getOpcode() == ISD::FNEG) {
4860 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4861 // (fneg (rcp (fneg x))) -> (rcp x)
4862 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4863 }
4864
4865 if (!N0.hasOneUse())
4866 return SDValue();
4867
4868 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4869 // (fneg (rcp x)) -> (rcp (fneg x))
4870 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4871 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4872 }
4873 case ISD::FP_ROUND: {
4874 SDValue CvtSrc = N0.getOperand(0);
4875
4876 if (CvtSrc.getOpcode() == ISD::FNEG) {
4877 // (fneg (fp_round (fneg x))) -> (fp_round x)
4878 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4879 CvtSrc.getOperand(0), N0.getOperand(1));
4880 }
4881
4882 if (!N0.hasOneUse())
4883 return SDValue();
4884
4885 // (fneg (fp_round x)) -> (fp_round (fneg x))
4886 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4887 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4888 }
4889 case ISD::FP16_TO_FP: {
4890 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4891 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4892 // Put the fneg back as a legal source operation that can be matched later.
4893 SDLoc SL(N);
4894
4895 SDValue Src = N0.getOperand(0);
4896 EVT SrcVT = Src.getValueType();
4897
4898 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4899 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4900 DAG.getConstant(0x8000, SL, SrcVT));
4901 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4902 }
4903 case ISD::SELECT: {
4904 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4905 // TODO: Invert conditions of foldFreeOpFromSelect
4906 return SDValue();
4907 }
4908 case ISD::BITCAST: {
4909 SDLoc SL(N);
4910 SDValue BCSrc = N0.getOperand(0);
4911 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4912 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4913 if (HighBits.getValueType().getSizeInBits() != 32 ||
4914 !fnegFoldsIntoOp(HighBits.getNode()))
4915 return SDValue();
4916
4917 // f64 fneg only really needs to operate on the high half of the
4918 // register, so try to force it to an f32 operation to help make use of
4919 // source modifiers.
4920 //
4921 //
4922 // fneg (f64 (bitcast (build_vector x, y))) ->
4923 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4924 // (fneg (bitcast i32:y to f32)))
4925
4926 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4927 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4928 SDValue CastBack =
4929 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4930
4931 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4932 Ops.back() = CastBack;
4933 DCI.AddToWorklist(NegHi.getNode());
4934 SDValue Build =
4935 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4936 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4937
4938 if (!N0.hasOneUse())
4939 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4940 return Result;
4941 }
4942
4943 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4944 BCSrc.hasOneUse()) {
4945 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4946 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4947
4948 // TODO: Cast back result for multiple uses is beneficial in some cases.
4949
4950 SDValue LHS =
4951 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4952 SDValue RHS =
4953 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4954
4955 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4956 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4957
4958 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4959 NegRHS);
4960 }
4961
4962 return SDValue();
4963 }
4964 default:
4965 return SDValue();
4966 }
4967}
4968
4969SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4970  DAGCombinerInfo &DCI) const {
4971 SelectionDAG &DAG = DCI.DAG;
4972 SDValue N0 = N->getOperand(0);
4973
4974 if (!N0.hasOneUse())
4975 return SDValue();
4976
4977 switch (N0.getOpcode()) {
4978 case ISD::FP16_TO_FP: {
4979 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4980 SDLoc SL(N);
4981 SDValue Src = N0.getOperand(0);
4982 EVT SrcVT = Src.getValueType();
4983
4984 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4985 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4986 DAG.getConstant(0x7fff, SL, SrcVT));
4987 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4988 }
4989 default:
4990 return SDValue();
4991 }
4992}
4993
4994SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4995  DAGCombinerInfo &DCI) const {
4996 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4997 if (!CFP)
4998 return SDValue();
4999
5000 // XXX - Should this flush denormals?
5001 const APFloat &Val = CFP->getValueAPF();
5002 APFloat One(Val.getSemantics(), "1.0");
5003 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5004}
5005
5006SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5007  DAGCombinerInfo &DCI) const {
5008 SelectionDAG &DAG = DCI.DAG;
5009 SDLoc DL(N);
5010
5011 switch(N->getOpcode()) {
5012 default:
5013 break;
5014 case ISD::BITCAST: {
5015 EVT DestVT = N->getValueType(0);
5016
5017 // Push casts through vector builds. This helps avoid emitting a large
5018 // number of copies when materializing floating point vector constants.
5019 //
5020 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5021 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5022 if (DestVT.isVector()) {
5023 SDValue Src = N->getOperand(0);
5024 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5025 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5026 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5027 EVT SrcVT = Src.getValueType();
5028 unsigned NElts = DestVT.getVectorNumElements();
5029
5030 if (SrcVT.getVectorNumElements() == NElts) {
5031 EVT DestEltVT = DestVT.getVectorElementType();
5032
5033 SmallVector<SDValue, 8> CastedElts;
5034 SDLoc SL(N);
5035 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5036 SDValue Elt = Src.getOperand(I);
5037 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5038 }
5039
5040 return DAG.getBuildVector(DestVT, SL, CastedElts);
5041 }
5042 }
5043 }
5044
5045 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5046 break;
5047
5048 // Fold bitcasts of constants.
5049 //
5050 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5051 // TODO: Generalize and move to DAGCombiner
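 // E.g. i64 0x0000000100000002 becomes build_vector {2, 1}: the first v2i32
 // element holds the low 32 bits and the second holds the high 32 bits.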
5052 SDValue Src = N->getOperand(0);
5053 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5054 SDLoc SL(N);
5055 uint64_t CVal = C->getZExtValue();
5056 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5057 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5058 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5059 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5060 }
5061
5062 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5063 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5064 SDLoc SL(N);
5065 uint64_t CVal = Val.getZExtValue();
5066 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5067 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5068 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5069
5070 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5071 }
5072
5073 break;
5074 }
5075 case ISD::SHL: {
5076 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5077 break;
5078
5079 return performShlCombine(N, DCI);
5080 }
5081 case ISD::SRL: {
5082 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5083 break;
5084
5085 return performSrlCombine(N, DCI);
5086 }
5087 case ISD::SRA: {
5088 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5089 break;
5090
5091 return performSraCombine(N, DCI);
5092 }
5093 case ISD::TRUNCATE:
5094 return performTruncateCombine(N, DCI);
5095 case ISD::MUL:
5096 return performMulCombine(N, DCI);
5097 case AMDGPUISD::MUL_U24:
5098 case AMDGPUISD::MUL_I24: {
5099 if (SDValue Simplified = simplifyMul24(N, DCI))
5100 return Simplified;
5101 break;
5102 }
5103 case AMDGPUISD::MULHI_I24:
5104 case AMDGPUISD::MULHI_U24:
5105 return simplifyMul24(N, DCI);
5106 case ISD::SMUL_LOHI:
5107 case ISD::UMUL_LOHI:
5108 return performMulLoHiCombine(N, DCI);
5109 case ISD::MULHS:
5110 return performMulhsCombine(N, DCI);
5111 case ISD::MULHU:
5112 return performMulhuCombine(N, DCI);
5113 case ISD::SELECT:
5114 return performSelectCombine(N, DCI);
5115 case ISD::FNEG:
5116 return performFNegCombine(N, DCI);
5117 case ISD::FABS:
5118 return performFAbsCombine(N, DCI);
5119 case AMDGPUISD::BFE_I32:
5120 case AMDGPUISD::BFE_U32: {
5121 assert(!N->getValueType(0).isVector() &&
5122 "Vector handling of BFE not implemented");
5123 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5124 if (!Width)
5125 break;
5126
5127 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5128 if (WidthVal == 0)
5129 return DAG.getConstant(0, DL, MVT::i32);
5130
5131 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5132 if (!Offset)
5133 break;
5134
5135 SDValue BitsFrom = N->getOperand(0);
5136 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5137
5138 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5139
5140 if (OffsetVal == 0) {
5141 // This is already sign / zero extended, so try to fold away extra BFEs.
5142 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5143
5144 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5145 if (OpSignBits >= SignBits)
5146 return BitsFrom;
5147
5148 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5149 if (Signed) {
5150 // This is a sign_extend_inreg. Replace it to take advantage of existing
5151 // DAG Combines. If not eliminated, we will match back to BFE during
5152 // selection.
5153
5154 // TODO: The sext_inreg of extended types ends up here, although we could
5155 // handle them in a single BFE.
5156 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5157 DAG.getValueType(SmallVT));
5158 }
5159
5160 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5161 }
5162
5163 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5164 if (Signed) {
5165 return constantFoldBFE<int32_t>(DAG,
5166 CVal->getSExtValue(),
5167 OffsetVal,
5168 WidthVal,
5169 DL);
5170 }
5171
5172 return constantFoldBFE<uint32_t>(DAG,
5173 CVal->getZExtValue(),
5174 OffsetVal,
5175 WidthVal,
5176 DL);
5177 }
5178
5179 if ((OffsetVal + WidthVal) >= 32 &&
5180 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5181 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5182 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5183 BitsFrom, ShiftVal);
5184 }
5185
5186 if (BitsFrom.hasOneUse()) {
5187 APInt Demanded = APInt::getBitsSet(32,
5188 OffsetVal,
5189 OffsetVal + WidthVal);
5190
5191 KnownBits Known;
5192 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5193 !DCI.isBeforeLegalizeOps());
5194 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5195 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5196 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5197 DCI.CommitTargetLoweringOpt(TLO);
5198 }
5199 }
5200
5201 break;
5202 }
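// Editorial sketch (not part of the upstream source): reference semantics of
// the 32-bit unsigned bitfield extract being combined above, with the same
// 5-bit masking of the offset and width operands. The helper name is
// hypothetical.
//
//   uint32_t bfeU32Ref(uint32_t Src, uint32_t Offset, uint32_t Width) {
//     Offset &= 0x1f;
//     Width &= 0x1f;
//     return Width == 0 ? 0 : (Src >> Offset) & ((1u << Width) - 1u);
//   }
//
// For example bfeU32Ref(0x00ABCDEF, 8, 8) == 0xCD, and with Offset == 0 the
// combine above instead rewrites the node as a zero/sign extend-in-reg of the
// low Width bits.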
5203 case ISD::LOAD:
5204 return performLoadCombine(N, DCI);
5205 case ISD::STORE:
5206 return performStoreCombine(N, DCI);
5207 case AMDGPUISD::RCP:
5208 case AMDGPUISD::RCP_IFLAG:
5209 return performRcpCombine(N, DCI);
5210 case ISD::AssertZext:
5211 case ISD::AssertSext:
5212 return performAssertSZExtCombine(N, DCI);
5213 case ISD::INTRINSIC_WO_CHAIN:
5214 return performIntrinsicWOChainCombine(N, DCI);
5215 case AMDGPUISD::FMAD_FTZ: {
5216 SDValue N0 = N->getOperand(0);
5217 SDValue N1 = N->getOperand(1);
5218 SDValue N2 = N->getOperand(2);
5219 EVT VT = N->getValueType(0);
5220
5221 // FMAD_FTZ is a FMAD + flush denormals to zero.
5222 // We flush the inputs, the intermediate step, and the output.
5223 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5224 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5225 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5226 if (N0CFP && N1CFP && N2CFP) {
5227 const auto FTZ = [](const APFloat &V) {
5228 if (V.isDenormal()) {
5229 APFloat Zero(V.getSemantics(), 0);
5230 return V.isNegative() ? -Zero : Zero;
5231 }
5232 return V;
5233 };
5234
5235 APFloat V0 = FTZ(N0CFP->getValueAPF());
5236 APFloat V1 = FTZ(N1CFP->getValueAPF());
5237 APFloat V2 = FTZ(N2CFP->getValueAPF());
5238 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5239 V0 = FTZ(V0);
5240 V0.add(V2, APFloat::rmNearestTiesToEven);
5241 return DAG.getConstantFP(FTZ(V0), DL, VT);
5242 }
5243 break;
5244 }
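// Editorial note (not part of the upstream source): the FTZ lambda above
// implements flush-to-zero on denormals. For f32, any input or intermediate
// value with magnitude below the smallest normal number (2^-126) is replaced
// by a correspondingly signed zero, so the folded constant is intended to
// match what the FMAD_FTZ hardware instruction would produce.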
5245 }
5246 return SDValue();
5247}
5248
5249//===----------------------------------------------------------------------===//
5250// Helper functions
5251//===----------------------------------------------------------------------===//
5252
5253 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5254 const TargetRegisterClass *RC,
5255 Register Reg, EVT VT,
5256 const SDLoc &SL,
5257 bool RawReg) const {
5258 MachineFunction &MF = DAG.getMachineFunction();
5259 MachineRegisterInfo &MRI = MF.getRegInfo();
5260 Register VReg;
5261
5262 if (!MRI.isLiveIn(Reg)) {
5263 VReg = MRI.createVirtualRegister(RC);
5264 MRI.addLiveIn(Reg, VReg);
5265 } else {
5266 VReg = MRI.getLiveInVirtReg(Reg);
5267 }
5268
5269 if (RawReg)
5270 return DAG.getRegister(VReg, VT);
5271
5272 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5273}
5274
5275// This may be called multiple times, and nothing prevents creating multiple
5276// objects at the same offset. See if we already defined this object.
5277 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5278 int64_t Offset) {
5279 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5280 if (MFI.getObjectOffset(I) == Offset) {
5281 assert(MFI.getObjectSize(I) == Size);
5282 return I;
5283 }
5284 }
5285
5286 return MFI.CreateFixedObject(Size, Offset, true);
5287}
5288
5289 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5290 EVT VT,
5291 const SDLoc &SL,
5292 int64_t Offset) const {
5293 MachineFunction &MF = DAG.getMachineFunction();
5294 MachineFrameInfo &MFI = MF.getFrameInfo();
5295 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5296
5297 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5298 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5299
5300 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5301 MachineMemOperand::MODereferenceable |
5302 MachineMemOperand::MOInvariant);
5303}
5304
5305 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5306 const SDLoc &SL,
5307 SDValue Chain,
5308 SDValue ArgVal,
5309 int64_t Offset) const {
5310 MachineFunction &MF = DAG.getMachineFunction();
5311 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5312 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5313
5314 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5315 // Stores to the argument stack area are relative to the stack pointer.
5316 SDValue SP =
5317 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5318 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5319 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5320 MachineMemOperand::MODereferenceable);
5321 return Store;
5322}
5323
5324 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5325 const TargetRegisterClass *RC,
5326 EVT VT, const SDLoc &SL,
5327 const ArgDescriptor &Arg) const {
5328 assert(Arg && "Attempting to load missing argument");
5329
5330 SDValue V = Arg.isRegister() ?
5331 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5332 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5333
5334 if (!Arg.isMasked())
5335 return V;
5336
5337 unsigned Mask = Arg.getMask();
5338 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5339 V = DAG.getNode(ISD::SRL, SL, VT, V,
5340 DAG.getShiftAmountConstant(Shift, VT, SL));
5341 return DAG.getNode(ISD::AND, SL, VT, V,
5342 DAG.getConstant(Mask >> Shift, SL, VT));
5343}
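// Editorial example (not part of the upstream source): for a masked argument
// descriptor, the shift/AND sequence above extracts a packed field. With a
// hypothetical Mask of 0x3ff << 10 (a 10-bit field starting at bit 10), the
// loaded value is computed as (V >> 10) & 0x3ff.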
5344
5345 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5346 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5347 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5348 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5349 uint64_t ArgOffset =
5350 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5351 switch (Param) {
5352 case FIRST_IMPLICIT:
5353 return ArgOffset;
5354 case PRIVATE_BASE:
5355 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5356 case SHARED_BASE:
5357 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5358 case QUEUE_PTR:
5359 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5360 }
5361 llvm_unreachable("unexpected implicit parameter type");
5362}
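// Editorial example (not part of the upstream source): with a hypothetical
// ExplicitKernArgSize of 12 bytes, an implicit-argument alignment of 8 and an
// ExplicitArgOffset of 0, FIRST_IMPLICIT resolves to alignTo(12, 8) + 0 == 16;
// the other parameters add their fixed AMDGPU::ImplicitArg byte offsets on
// top of that base.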
5363
5364 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5365 const MachineFunction &MF, const ImplicitParameter Param) const {
5366 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5367 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5368}
5369
5370#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5371
5372const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5373 switch ((AMDGPUISD::NodeType)Opcode) {
5374 case AMDGPUISD::FIRST_NUMBER: break;
5375 // AMDIL DAG nodes
5376 NODE_NAME_CASE(UMUL);
5377 NODE_NAME_CASE(BRANCH_COND);
5378
5379 // AMDGPU DAG nodes
5380 NODE_NAME_CASE(IF)
5381 NODE_NAME_CASE(ELSE)
5382 NODE_NAME_CASE(LOOP)
5383 NODE_NAME_CASE(CALL)
5384 NODE_NAME_CASE(TC_RETURN)
5385 NODE_NAME_CASE(TC_RETURN_GFX)
5386 NODE_NAME_CASE(TC_RETURN_CHAIN)
5387 NODE_NAME_CASE(TRAP)
5388 NODE_NAME_CASE(RET_GLUE)
5389 NODE_NAME_CASE(WAVE_ADDRESS)
5390 NODE_NAME_CASE(RETURN_TO_EPILOG)
5391 NODE_NAME_CASE(ENDPGM)
5392 NODE_NAME_CASE(ENDPGM_TRAP)
5393 NODE_NAME_CASE(SIMULATED_TRAP)
5394 NODE_NAME_CASE(DWORDADDR)
5395 NODE_NAME_CASE(FRACT)
5396 NODE_NAME_CASE(SETCC)
5397 NODE_NAME_CASE(SETREG)
5398 NODE_NAME_CASE(DENORM_MODE)
5399 NODE_NAME_CASE(FMA_W_CHAIN)
5400 NODE_NAME_CASE(FMUL_W_CHAIN)
5401 NODE_NAME_CASE(CLAMP)
5402 NODE_NAME_CASE(COS_HW)
5403 NODE_NAME_CASE(SIN_HW)
5404 NODE_NAME_CASE(FMAX_LEGACY)
5405 NODE_NAME_CASE(FMIN_LEGACY)
5406 NODE_NAME_CASE(FMAX3)
5407 NODE_NAME_CASE(SMAX3)
5408 NODE_NAME_CASE(UMAX3)
5409 NODE_NAME_CASE(FMIN3)
5410 NODE_NAME_CASE(SMIN3)
5411 NODE_NAME_CASE(UMIN3)
5412 NODE_NAME_CASE(FMED3)
5413 NODE_NAME_CASE(SMED3)
5414 NODE_NAME_CASE(UMED3)
5415 NODE_NAME_CASE(FMAXIMUM3)
5416 NODE_NAME_CASE(FMINIMUM3)
5417 NODE_NAME_CASE(FDOT2)
5418 NODE_NAME_CASE(URECIP)
5419 NODE_NAME_CASE(DIV_SCALE)
5420 NODE_NAME_CASE(DIV_FMAS)
5421 NODE_NAME_CASE(DIV_FIXUP)
5422 NODE_NAME_CASE(FMAD_FTZ)
5423 NODE_NAME_CASE(RCP)
5424 NODE_NAME_CASE(RSQ)
5425 NODE_NAME_CASE(RCP_LEGACY)
5426 NODE_NAME_CASE(RCP_IFLAG)
5427 NODE_NAME_CASE(LOG)
5428 NODE_NAME_CASE(EXP)
5429 NODE_NAME_CASE(FMUL_LEGACY)
5430 NODE_NAME_CASE(RSQ_CLAMP)
5431 NODE_NAME_CASE(FP_CLASS)
5432 NODE_NAME_CASE(DOT4)
5433 NODE_NAME_CASE(CARRY)
5434 NODE_NAME_CASE(BORROW)
5435 NODE_NAME_CASE(BFE_U32)
5436 NODE_NAME_CASE(BFE_I32)
5437 NODE_NAME_CASE(BFI)
5438 NODE_NAME_CASE(BFM)
5439 NODE_NAME_CASE(FFBH_U32)
5440 NODE_NAME_CASE(FFBH_I32)
5441 NODE_NAME_CASE(FFBL_B32)
5442 NODE_NAME_CASE(MUL_U24)
5443 NODE_NAME_CASE(MUL_I24)
5444 NODE_NAME_CASE(MULHI_U24)
5445 NODE_NAME_CASE(MULHI_I24)
5446 NODE_NAME_CASE(MAD_U24)
5447 NODE_NAME_CASE(MAD_I24)
5448 NODE_NAME_CASE(MAD_I64_I32)
5449 NODE_NAME_CASE(MAD_U64_U32)
5450 NODE_NAME_CASE(PERM)
5451 NODE_NAME_CASE(TEXTURE_FETCH)
5452 NODE_NAME_CASE(R600_EXPORT)
5453 NODE_NAME_CASE(CONST_ADDRESS)
5454 NODE_NAME_CASE(REGISTER_LOAD)
5455 NODE_NAME_CASE(REGISTER_STORE)
5456 NODE_NAME_CASE(SAMPLE)
5457 NODE_NAME_CASE(SAMPLEB)
5458 NODE_NAME_CASE(SAMPLED)
5459 NODE_NAME_CASE(SAMPLEL)
5460 NODE_NAME_CASE(CVT_F32_UBYTE0)
5461 NODE_NAME_CASE(CVT_F32_UBYTE1)
5462 NODE_NAME_CASE(CVT_F32_UBYTE2)
5463 NODE_NAME_CASE(CVT_F32_UBYTE3)
5464 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5465 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5466 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5467 NODE_NAME_CASE(CVT_PK_I16_I32)
5468 NODE_NAME_CASE(CVT_PK_U16_U32)
5469 NODE_NAME_CASE(FP_TO_FP16)
5470 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5471 NODE_NAME_CASE(CONST_DATA_PTR)
5472 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5473 NODE_NAME_CASE(LDS)
5474 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5475 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5476 NODE_NAME_CASE(DUMMY_CHAIN)
5477 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5478 NODE_NAME_CASE(LOAD_D16_HI)
5479 NODE_NAME_CASE(LOAD_D16_LO)
5480 NODE_NAME_CASE(LOAD_D16_HI_I8)
5481 NODE_NAME_CASE(LOAD_D16_HI_U8)
5482 NODE_NAME_CASE(LOAD_D16_LO_I8)
5483 NODE_NAME_CASE(LOAD_D16_LO_U8)
5484 NODE_NAME_CASE(STORE_MSKOR)
5485 NODE_NAME_CASE(LOAD_CONSTANT)
5486 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5487 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5488 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5489 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5490 NODE_NAME_CASE(DS_ORDERED_COUNT)
5491 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5492 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5493 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5494 NODE_NAME_CASE(BUFFER_LOAD)
5495 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5496 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5497 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5498 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5499 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5500 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5501 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5502 NODE_NAME_CASE(SBUFFER_LOAD)
5503 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5504 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5505 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5506 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5507 NODE_NAME_CASE(BUFFER_STORE)
5508 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5509 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5510 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5511 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5512 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5513 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5514 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5515 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5516 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5517 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5518 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5519 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5520 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5521 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5522 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5523 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5524 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5525 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5526 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5527 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5528 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5529 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5530 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5531
5532 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5533 }
5534 return nullptr;
5535}
5536
5537 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5538 SelectionDAG &DAG, int Enabled,
5539 int &RefinementSteps,
5540 bool &UseOneConstNR,
5541 bool Reciprocal) const {
5542 EVT VT = Operand.getValueType();
5543
5544 if (VT == MVT::f32) {
5545 RefinementSteps = 0;
5546 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5547 }
5548
5549 // TODO: There is also an f64 rsq instruction, but the documentation is less
5550 // clear on its precision.
5551
5552 return SDValue();
5553}
5554
5555 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5556 SelectionDAG &DAG, int Enabled,
5557 int &RefinementSteps) const {
5558 EVT VT = Operand.getValueType();
5559
5560 if (VT == MVT::f32) {
5561 // Reciprocal, < 1 ulp error.
5562 //
5563 // This reciprocal approximation converges to < 0.5 ulp error with one
5564 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5565
5566 RefinementSteps = 0;
5567 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5568 }
5569
5570 // TODO: There is also an f64 rcp instruction, but the documentation is less
5571 // clear on its precision.
5572
5573 return SDValue();
5574}
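// Editorial sketch (not part of the upstream source): the single
// Newton-Raphson refinement step mentioned above, written with two fused
// multiply-adds. RefinementSteps is 0 because the hardware reciprocal is
// already accurate enough, but one step would look like:
//
//   float refineRcp(float d, float x0) {   // x0 ~ 1/d from the hardware rcp
//     float e = std::fma(-d, x0, 1.0f);    // e  = 1 - d * x0
//     return std::fma(x0, e, x0);          // x1 = x0 + x0 * e
//   }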
5575
5576static unsigned workitemIntrinsicDim(unsigned ID) {
5577 switch (ID) {
5578 case Intrinsic::amdgcn_workitem_id_x:
5579 return 0;
5580 case Intrinsic::amdgcn_workitem_id_y:
5581 return 1;
5582 case Intrinsic::amdgcn_workitem_id_z:
5583 return 2;
5584 default:
5585 llvm_unreachable("not a workitem intrinsic");
5586 }
5587}
5588
5589 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5590 const SDValue Op, KnownBits &Known,
5591 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5592
5593 Known.resetAll(); // Don't know anything.
5594
5595 unsigned Opc = Op.getOpcode();
5596
5597 switch (Opc) {
5598 default:
5599 break;
5600 case AMDGPUISD::CARRY:
5601 case AMDGPUISD::BORROW: {
5602 Known.Zero = APInt::getHighBitsSet(32, 31);
5603 break;
5604 }
5605
5606 case AMDGPUISD::BFE_I32:
5607 case AMDGPUISD::BFE_U32: {
5608 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5609 if (!CWidth)
5610 return;
5611
5612 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5613
5614 if (Opc == AMDGPUISD::BFE_U32)
5615 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5616
5617 break;
5618 }
5619 case AMDGPUISD::FP_TO_FP16: {
5620 unsigned BitWidth = Known.getBitWidth();
5621
5622 // High bits are zero.
5623 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5624 break;
5625 }
5626 case AMDGPUISD::MUL_U24:
5627 case AMDGPUISD::MUL_I24: {
5628 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5629 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5630 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5631 RHSKnown.countMinTrailingZeros();
5632 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5633 // Skip extra check if all bits are known zeros.
5634 if (TrailZ >= 32)
5635 break;
5636
5637 // Truncate to 24 bits.
5638 LHSKnown = LHSKnown.trunc(24);
5639 RHSKnown = RHSKnown.trunc(24);
5640
5641 if (Opc == AMDGPUISD::MUL_I24) {
5642 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5643 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5644 unsigned MaxValBits = LHSValBits + RHSValBits;
5645 if (MaxValBits > 32)
5646 break;
5647 unsigned SignBits = 32 - MaxValBits + 1;
5648 bool LHSNegative = LHSKnown.isNegative();
5649 bool LHSNonNegative = LHSKnown.isNonNegative();
5650 bool LHSPositive = LHSKnown.isStrictlyPositive();
5651 bool RHSNegative = RHSKnown.isNegative();
5652 bool RHSNonNegative = RHSKnown.isNonNegative();
5653 bool RHSPositive = RHSKnown.isStrictlyPositive();
5654
5655 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5656 Known.Zero.setHighBits(SignBits);
5657 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5658 Known.One.setHighBits(SignBits);
5659 } else {
5660 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5661 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5662 unsigned MaxValBits = LHSValBits + RHSValBits;
5663 if (MaxValBits >= 32)
5664 break;
5665 Known.Zero.setBitsFrom(MaxValBits);
5666 }
5667 break;
5668 }
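// Editorial example (not part of the upstream source): in the unsigned case
// above, if the LHS has at most 7 active bits and the RHS at most 5, the
// 24-bit multiply can produce at most a 12-bit result, so bits [12, 31] of
// the product are known to be zero (Known.Zero.setBitsFrom(12)).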
5669 case AMDGPUISD::PERM: {
5670 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5671 if (!CMask)
5672 return;
5673
5674 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5675 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5676 unsigned Sel = CMask->getZExtValue();
5677
5678 for (unsigned I = 0; I < 32; I += 8) {
5679 unsigned SelBits = Sel & 0xff;
5680 if (SelBits < 4) {
5681 SelBits *= 8;
5682 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5683 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5684 } else if (SelBits < 7) {
5685 SelBits = (SelBits & 3) * 8;
5686 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5687 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5688 } else if (SelBits == 0x0c) {
5689 Known.Zero |= 0xFFull << I;
5690 } else if (SelBits > 0x0c) {
5691 Known.One |= 0xFFull << I;
5692 }
5693 Sel >>= 8;
5694 }
5695 break;
5696 }
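// Editorial example (not part of the upstream source): PERM picks each output
// byte with one selector byte, consumed low to high. For a constant selector
// of 0x0d0c0504, output byte 0 is byte 0 of operand 0, output byte 1 is byte 1
// of operand 0, output byte 2 is known to be 0x00 (selector 0x0c), and output
// byte 3 is known to be 0xff (selector 0x0d), which is what the loop above
// records in Known.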
5697 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5698 Known.Zero.setHighBits(24);
5699 break;
5700 }
5701 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5702 Known.Zero.setHighBits(16);
5703 break;
5704 }
5705 case AMDGPUISD::LDS: {
5706 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5707 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5708
5709 Known.Zero.setHighBits(16);
5710 Known.Zero.setLowBits(Log2(Alignment));
5711 break;
5712 }
5713 case AMDGPUISD::SMIN3:
5714 case AMDGPUISD::SMAX3:
5715 case AMDGPUISD::SMED3:
5716 case AMDGPUISD::UMIN3:
5717 case AMDGPUISD::UMAX3:
5718 case AMDGPUISD::UMED3: {
5719 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5720 if (Known2.isUnknown())
5721 break;
5722
5723 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5724 if (Known1.isUnknown())
5725 break;
5726
5727 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5728 if (Known0.isUnknown())
5729 break;
5730
5731 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5732 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5733 Known.One = Known0.One & Known1.One & Known2.One;
5734 break;
5735 }
5736 case ISD::INTRINSIC_WO_CHAIN: {
5737 unsigned IID = Op.getConstantOperandVal(0);
5738 switch (IID) {
5739 case Intrinsic::amdgcn_workitem_id_x:
5740 case Intrinsic::amdgcn_workitem_id_y:
5741 case Intrinsic::amdgcn_workitem_id_z: {
5742 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5743 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5744 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5745 break;
5746 }
5747 default:
5748 break;
5749 }
5750 }
5751 }
5752}
5753
5754 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5755 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5756 unsigned Depth) const {
5757 switch (Op.getOpcode()) {
5758 case AMDGPUISD::BFE_I32: {
5759 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5760 if (!Width)
5761 return 1;
5762
5763 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5764 if (!isNullConstant(Op.getOperand(1)))
5765 return SignBits;
5766
5767 // TODO: Could probably figure something out with non-0 offsets.
5768 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5769 return std::max(SignBits, Op0SignBits);
5770 }
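// Editorial example (not part of the upstream source): a BFE_I32 with a
// constant width of 8 and offset 0 yields a value sign-extended from 8 bits,
// so at least 32 - 8 + 1 == 25 of the result bits are copies of the sign bit.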
5771
5772 case AMDGPUISD::BFE_U32: {
5773 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5774 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5775 }
5776
5777 case AMDGPUISD::CARRY:
5778 case AMDGPUISD::BORROW:
5779 return 31;
5780 case AMDGPUISD::BUFFER_LOAD_BYTE:
5781 return 25;
5782 case AMDGPUISD::BUFFER_LOAD_SHORT:
5783 return 17;
5784 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5785 return 24;
5786 case AMDGPUISD::BUFFER_LOAD_USHORT:
5787 return 16;
5788 case AMDGPUISD::FP_TO_FP16:
5789 return 16;
5790 case AMDGPUISD::SMIN3:
5791 case AMDGPUISD::SMAX3:
5792 case AMDGPUISD::SMED3:
5793 case AMDGPUISD::UMIN3:
5794 case AMDGPUISD::UMAX3:
5795 case AMDGPUISD::UMED3: {
5796 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5797 if (Tmp2 == 1)
5798 return 1; // Early out.
5799
5800 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5801 if (Tmp1 == 1)
5802 return 1; // Early out.
5803
5804 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5805 if (Tmp0 == 1)
5806 return 1; // Early out.
5807
5808 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5809 }
5810 default:
5811 return 1;
5812 }
5813}
5814
5815 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5816 GISelKnownBits &Analysis, Register R,
5817 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5818 unsigned Depth) const {
5819 const MachineInstr *MI = MRI.getVRegDef(R);
5820 if (!MI)
5821 return 1;
5822
5823 // TODO: Check range metadata on MMO.
5824 switch (MI->getOpcode()) {
5825 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5826 return 25;
5827 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5828 return 17;
5829 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5830 return 24;
5831 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5832 return 16;
5833 case AMDGPU::G_AMDGPU_SMED3:
5834 case AMDGPU::G_AMDGPU_UMED3: {
5835 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5836 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5837 if (Tmp2 == 1)
5838 return 1;
5839 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5840 if (Tmp1 == 1)
5841 return 1;
5842 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5843 if (Tmp0 == 1)
5844 return 1;
5845 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5846 }
5847 default:
5848 return 1;
5849 }
5850}
5851
5852 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5853 const SelectionDAG &DAG,
5854 bool SNaN,
5855 unsigned Depth) const {
5856 unsigned Opcode = Op.getOpcode();
5857 switch (Opcode) {
5858 case AMDGPUISD::FMIN_LEGACY:
5859 case AMDGPUISD::FMAX_LEGACY: {
5860 if (SNaN)
5861 return true;
5862
5863 // TODO: Can check no nans on one of the operands for each one, but which
5864 // one?
5865 return false;
5866 }
5867 case AMDGPUISD::FMUL_LEGACY:
5868 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5869 if (SNaN)
5870 return true;
5871 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5872 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5873 }
5874 case AMDGPUISD::FMED3:
5875 case AMDGPUISD::FMIN3:
5876 case AMDGPUISD::FMAX3:
5877 case AMDGPUISD::FMINIMUM3:
5878 case AMDGPUISD::FMAXIMUM3:
5879 case AMDGPUISD::FMAD_FTZ: {
5880 if (SNaN)
5881 return true;
5882 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5883 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5884 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5885 }
5886 case AMDGPUISD::CVT_F32_UBYTE0:
5887 case AMDGPUISD::CVT_F32_UBYTE1:
5888 case AMDGPUISD::CVT_F32_UBYTE2:
5889 case AMDGPUISD::CVT_F32_UBYTE3:
5890 return true;
5891
5892 case AMDGPUISD::RCP:
5893 case AMDGPUISD::RSQ:
5894 case AMDGPUISD::RCP_LEGACY:
5895 case AMDGPUISD::RSQ_CLAMP: {
5896 if (SNaN)
5897 return true;
5898
5899 // TODO: Need an is-known-positive check.
5900 return false;
5901 }
5902 case ISD::FLDEXP:
5903 case AMDGPUISD::FRACT: {
5904 if (SNaN)
5905 return true;
5906 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5907 }
5908 case AMDGPUISD::DIV_SCALE:
5909 case AMDGPUISD::DIV_FMAS:
5910 case AMDGPUISD::DIV_FIXUP:
5911 // TODO: Refine on operands.
5912 return SNaN;
5913 case AMDGPUISD::SIN_HW:
5914 case AMDGPUISD::COS_HW: {
5915 // TODO: Need check for infinity
5916 return SNaN;
5917 }
5918 case ISD::INTRINSIC_WO_CHAIN: {
5919 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5920 // TODO: Handle more intrinsics
5921 switch (IntrinsicID) {
5922 case Intrinsic::amdgcn_cubeid:
5923 return true;
5924
5925 case Intrinsic::amdgcn_frexp_mant: {
5926 if (SNaN)
5927 return true;
5928 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5929 }
5930 case Intrinsic::amdgcn_cvt_pkrtz: {
5931 if (SNaN)
5932 return true;
5933 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5934 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5935 }
5936 case Intrinsic::amdgcn_rcp:
5937 case Intrinsic::amdgcn_rsq:
5938 case Intrinsic::amdgcn_rcp_legacy:
5939 case Intrinsic::amdgcn_rsq_legacy:
5940 case Intrinsic::amdgcn_rsq_clamp: {
5941 if (SNaN)
5942 return true;
5943
5944 // TODO: Need an is-known-positive check.
5945 return false;
5946 }
5947 case Intrinsic::amdgcn_trig_preop:
5948 case Intrinsic::amdgcn_fdot2:
5949 // TODO: Refine on operand
5950 return SNaN;
5951 case Intrinsic::amdgcn_fma_legacy:
5952 if (SNaN)
5953 return true;
5954 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5955 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5956 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5957 default:
5958 return false;
5959 }
5960 }
5961 default:
5962 return false;
5963 }
5964}
5965
5966 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
5967 Register N0, Register N1) const {
5968 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
5969}
5970
5971 TargetLowering::AtomicExpansionKind
5972 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
5973 switch (RMW->getOperation()) {
5974 case AtomicRMWInst::Nand:
5975 case AtomicRMWInst::FAdd:
5976 case AtomicRMWInst::FSub:
5977 case AtomicRMWInst::FMax:
5978 case AtomicRMWInst::FMin:
5979 return AtomicExpansionKind::CmpXChg;
5980 default: {
5981 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
5982 unsigned Size = IntTy->getBitWidth();
5983 if (Size == 32 || Size == 64)
5984 return AtomicExpansionKind::None;
5985 }
5986
5987 return AtomicExpansionKind::CmpXChg;
5988 }
5989 }
5990}
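// Editorial note (not part of the upstream source): with this base
// implementation, a 32-bit or 64-bit integer atomicrmw (for example an add on
// an i32) is left alone, while other widths and the floating-point / nand
// operations are expanded by the AtomicExpand pass into a compare-and-swap
// loop.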
5991
5992/// Whether it is profitable to sink the operands of an
5993/// Instruction I to the basic block of I.
5994/// This helps using several modifiers (like abs and neg) more often.
5995 bool AMDGPUTargetLowering::shouldSinkOperands(
5996 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5997 using namespace PatternMatch;
5998
5999 for (auto &Op : I->operands()) {
6000 // Ensure we are not already sinking this operand.
6001 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
6002 continue;
6003
6004 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
6005 Ops.push_back(&Op);
6006 }
6007
6008 return !Ops.empty();
6009}
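// Editorial example (not part of the upstream source): if one block computes
// %n = fneg float %x and a different block uses %n in an fma, sinking the
// fneg next to its user lets instruction selection fold it into the VOP3
// source-modifier (neg) bit instead of emitting a separate instruction.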
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_READONLY
Definition: Compiler.h:227
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:266
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1260
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1042
const fltSemantics & getSemantics() const
Definition: APFloat.h:1303
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1060
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1026
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ FSub
*p = old - v
Definition: Instructions.h:788
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
BinOp getOperation() const
Definition: Instructions.h:845
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for unsupported feature in backend.
iterator_range< arg_iterator > args()
Definition: Function.h:842
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
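A minimal sketch (hypothetical; assumes an i64 SDValue N64 and SDLoc DL): splitting a 64-bit scalar into its 32-bit halves.
// Lo receives bits [31:0], Hi receives bits [63:32].
auto [Lo, Hi] = DAG.SplitScalar(N64, DL, MVT::i32, MVT::i32);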
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on element type.
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering of inequal strings isn't needed.
Definition: StringRef.h:164
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predicted right.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
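A hypothetical sketch of how a target constructor opts into the bypass; the widths below are illustrative.
// Guard 64-bit divides with a runtime check and use a 32-bit divide
// whenever both operands actually fit in 32 bits.
addBypassSlowDiv(/*SlowBitWidth=*/64, /*FastBitWidth=*/32);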
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes=true) const
Returns the type for the shift amount of a shift opcode.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
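A hypothetical sketch of the Promote/AddPromotedToType pairing inside a TargetLowering constructor; the opcode and types are illustrative only.
// Handle 16-bit AND by widening it and performing the operation in 32 bits.
setOperationAction(ISD::AND, MVT::i16, Promote);
AddPromotedToType(ISD::AND, MVT::i16, MVT::i32);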
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate what to do about it.
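A hypothetical sketch inside a TargetLowering constructor; the extension kinds and types are illustrative.
// Sign-extending i8 loads into i32 must be expanded (plain load + SIGN_EXTEND),
// while zero-extending ones are declared natively legal.
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i8, Legal);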
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned memory access.
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be u...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the demanded bits/elements of Op.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
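A hedged sketch of using this helper from a custom LowerLOAD-style hook (hypothetical; assumes Op is the load being lowered and DL is its SDLoc).
LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
// Returns the rebuilt vector value and the new chain.
auto [Vec, OutChain] = scalarizeVectorLoad(LD, DAG);
return DAG.getMergeValues({Vec, OutChain}, DL);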
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is not commonly executed.
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition: ISDOpcodes.h:251
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:270
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-precision floating point values.
Definition: ISDOpcodes.h:914
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:501
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:508
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:223
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:985
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1104
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:991
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:280
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:945
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1101
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero or sign extended from a narrower type.
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool getAlign(const Function &F, unsigned index, unsigned &align)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
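A minimal sketch of the two integer helpers listed above, PowerOf2Ceil and countl_zero (expected results in the comments):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
uint64_t Padded = llvm::PowerOf2Ceil(100);       // 128
int Leading = llvm::countl_zero<uint32_t>(128u); // 24 leading zero bits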
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
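A minimal sketch showing Hi_32/Lo_32 splitting a 64-bit immediate into its 32-bit halves (expected results in the comments):
#include "llvm/Support/MathExtras.h"
uint64_t Imm = 0x123456789ABCDEF0ULL;
uint32_t LoHalf = llvm::Lo_32(Imm); // 0x9ABCDEF0
uint32_t HiHalf = llvm::Hi_32(Imm); // 0x12345678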
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
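A hedged sketch (hypothetical; assumes a TargetLowering reference TLI, a SelectionDAG &DAG and an IR Type *RetTy): computing the EVT sequence an aggregate decomposes into for call lowering.
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), RetTy, ValueVTs);
// e.g. the IR type {i32, float} yields [MVT::i32, MVT::f32].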
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1387
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
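A minimal sketch of the alignment helpers above (expected results in the comments):
#include "llvm/Support/Alignment.h"
llvm::Align PtrAlign(16);
llvm::Align OffAlign = llvm::commonAlignment(PtrAlign, /*Offset=*/4); // Align(4)
unsigned Shift = llvm::Log2(OffAlign);                                // 2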
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:462
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:141
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:110
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bits.
Definition: KnownBits.h:265
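A hedged sketch of querying these KnownBits accessors on a DAG value (hypothetical; assumes an SDValue Op and a SelectionDAG &DAG in scope).
KnownBits Known = DAG.computeKnownBits(Op);
if (Known.isNonNegative() && Known.countMaxActiveBits() <= 16) {
  // Op provably fits in 16 unsigned bits; a narrower operation is safe here.
}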
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...