AMDGPUISelLowering.cpp
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
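// For example, a 24-bit vector such as v3i8 maps to i24, while a 96-bit
// vector such as v3f32 maps to v3i32.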
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47}
48
51}
52
54 // In order for this to be a signed 24-bit value, bit 23 must
55 // be a sign bit.
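56 // ComputeMaxSignificantBits is the value's bit width minus its known sign
57 // bits, plus one: the narrowest signed integer width that can hold the value.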
56 return DAG.ComputeMaxSignificantBits(Op);
57}
58
60 const AMDGPUSubtarget &STI)
61 : TargetLowering(TM), Subtarget(&STI) {
62 // Lower floating point store/load to integer store/load to reduce the number
63 // of patterns in tablegen.
65 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
66
68 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
69
71 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
72
74 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
75
77 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
78
80 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
81
83 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
84
86 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
87
89 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
90
92 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
93
95 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
96
98 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
99
100 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
102
103 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
105
107 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
108
110 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
111
113 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
114
116 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
117
119 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
120
122 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
123
125 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
126
128 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
129
131 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
132
134 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
135
136 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
137 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
138
139 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
140 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
141
143 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
144
145 // There are no 64-bit extloads. These should be done as a 32-bit extload and
146 // an extension to 64-bit.
147 for (MVT VT : MVT::integer_valuetypes())
149 Expand);
150
151 for (MVT VT : MVT::integer_valuetypes()) {
152 if (VT == MVT::i64)
153 continue;
154
155 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
156 setLoadExtAction(Op, VT, MVT::i1, Promote);
157 setLoadExtAction(Op, VT, MVT::i8, Legal);
158 setLoadExtAction(Op, VT, MVT::i16, Legal);
159 setLoadExtAction(Op, VT, MVT::i32, Expand);
160 }
161 }
162
164 for (auto MemVT :
165 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
167 Expand);
168
169 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
171 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
172 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
173 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
174 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
175 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
177 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
178 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
183
184 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
185 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
188 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
189 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
190
191 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
192 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
193 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
195 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
196 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
197 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
198 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
199 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
200 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
201 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
202 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
203
205 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
206
208 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
209
211 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
212
214 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
215
217 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
218
220 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
221
223 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
224
226 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
227
229 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
230
232 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
233
235 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
236
238 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
239
241 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
245
247 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
251
253 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
278
280 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
281
283 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
284
285 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
286 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
287 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
288 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
289
290 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
291 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
292 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
293 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
294
295 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
296 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
297 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
298 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
299 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
300 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
301 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
302 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
303
304 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
305 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
306 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
307
308 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
309 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
310
311 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
312
313 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
314 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
315 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
316 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
317 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
318 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
319
320 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
321 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
322 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
323 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
324
325 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
326 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
327
328 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
329 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
330 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
331 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
332 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
333 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
334 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
335
336 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
337 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
338
340
341 // For R600, this is totally unsupported; just custom lower to produce an
342 // error.
344
345 // Library functions. These default to Expand, but we have instructions
346 // for them.
349 MVT::f32, Legal);
350
352 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
353
356 Custom);
357
358 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
359
360 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
361
362 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
363
364 if (Subtarget->has16BitInsts())
365 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
366 else {
367 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
369 }
370
372 Custom);
373
374 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
375 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
376 // default unless marked custom/legal.
379 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
380 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
381 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
382 Custom);
383
384 // Expand to fneg + fadd.
386
388 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
389 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
390 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
391 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
392 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
393 Custom);
394
395 // FIXME: Why is v8f16/v8bf16 missing?
398 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
399 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
400 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
401 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
402 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
403 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
404 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
405 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
406 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
407 MVT::v32i16, MVT::v32f16, MVT::v32bf16},
408 Custom);
409
411 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
412
413 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
414 for (MVT VT : ScalarIntVTs) {
415 // These should use [SU]DIVREM, so set them to expand
417 Expand);
418
419 // GPU does not have divrem function for signed or unsigned.
421
422 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
424
426
427 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
429 }
430
431 // The hardware supports 32-bit FSHR, but not FSHL.
433
434 // The hardware supports 32-bit ROTR, but not ROTL.
435 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
437
439
443 MVT::i64, Custom);
445
447 Legal);
448
451 MVT::i64, Custom);
452
453 for (auto VT : {MVT::i8, MVT::i16})
455
456 static const MVT::SimpleValueType VectorIntTypes[] = {
457 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
458 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
459
460 for (MVT VT : VectorIntTypes) {
461 // Expand the following operations for the current type by default.
473 ISD::SETCC},
474 VT, Expand);
475 }
476
477 static const MVT::SimpleValueType FloatVectorTypes[] = {
478 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
479 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
480
481 for (MVT VT : FloatVectorTypes) {
494 VT, Expand);
495 }
496
497 // This causes us to use an unrolled select operation rather than expansion with
498 // bit operations. This is in general better, but the alternative using BFI
499 // instructions may be better if the select sources are SGPRs.
501 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
502
504 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
505
507 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
508
510 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
511
513 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
514
516 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
517
519 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
520
522 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
523
525 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
526
528 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
529
530 // Disable most libcalls.
531 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
532 if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
533 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
534 }
535
537 setJumpIsExpensive(true);
538
539 // FIXME: This is only partially true. If we have to do vector compares, any
540 // SGPR pair can be a condition register. If we have a uniform condition, we
541 // are better off doing SALU operations, where there is only one SCC. For now,
542 // we don't have a way of knowing during instruction selection if a condition
543 // will be uniform and we always use vector compares. Assume we are using
544 // vector compares until that is fixed.
546
549
551
552 // We want to find all load dependencies for long chains of stores to enable
553 // merging into very wide vectors. The problem is with vectors with > 4
554 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
555 // vectors are a legal type, even though we have to split the loads
556 // usually. When we can more precisely specify load legality per address
557 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
558 // smarter so that they can figure out what to do in 2 iterations without all
559 // N > 4 stores on the same chain.
561
562 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
563 // about these during lowering.
564 MaxStoresPerMemcpy = 0xffffffff;
565 MaxStoresPerMemmove = 0xffffffff;
566 MaxStoresPerMemset = 0xffffffff;
567
568 // The expansion for 64-bit division is enormous.
570 addBypassSlowDiv(64, 32);
571
582
586}
587
589 if (getTargetMachine().Options.NoSignedZerosFPMath)
590 return true;
591
592 const auto Flags = Op.getNode()->getFlags();
593 if (Flags.hasNoSignedZeros())
594 return true;
595
596 return false;
597}
598
599//===----------------------------------------------------------------------===//
600// Target Information
601//===----------------------------------------------------------------------===//
602
604static bool fnegFoldsIntoOpcode(unsigned Opc) {
605 switch (Opc) {
606 case ISD::FADD:
607 case ISD::FSUB:
608 case ISD::FMUL:
609 case ISD::FMA:
610 case ISD::FMAD:
611 case ISD::FMINNUM:
612 case ISD::FMAXNUM:
615 case ISD::FMINIMUM:
616 case ISD::FMAXIMUM:
617 case ISD::SELECT:
618 case ISD::FSIN:
619 case ISD::FTRUNC:
620 case ISD::FRINT:
621 case ISD::FNEARBYINT:
622 case ISD::FROUNDEVEN:
624 case AMDGPUISD::RCP:
631 case AMDGPUISD::FMED3:
632 // TODO: handle llvm.amdgcn.fma.legacy
633 return true;
634 case ISD::BITCAST:
635 llvm_unreachable("bitcast is special cased");
636 default:
637 return false;
638 }
639}
640
641static bool fnegFoldsIntoOp(const SDNode *N) {
642 unsigned Opc = N->getOpcode();
643 if (Opc == ISD::BITCAST) {
644 // TODO: Is there a benefit to checking the conditions performFNegCombine
645 // does? We don't for the other cases.
646 SDValue BCSrc = N->getOperand(0);
647 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
648 return BCSrc.getNumOperands() == 2 &&
649 BCSrc.getOperand(1).getValueSizeInBits() == 32;
650 }
651
652 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
653 }
654
655 return fnegFoldsIntoOpcode(Opc);
656}
657
658/// \returns true if the operation will definitely need to use a 64-bit
659/// encoding, and thus will use a VOP3 encoding regardless of the source
660/// modifiers.
662static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
663 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
664 VT == MVT::f64;
665}
666
667/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
668/// given type when used by ISD::SELECT.
670static bool selectSupportsSourceMods(const SDNode *N) {
671 // TODO: Only applies if select will be vector
672 return N->getValueType(0) == MVT::f32;
673}
674
675// Most FP instructions support source modifiers, but this could be refined
676// slightly.
678static bool hasSourceMods(const SDNode *N) {
679 if (isa<MemSDNode>(N))
680 return false;
681
682 switch (N->getOpcode()) {
683 case ISD::CopyToReg:
684 case ISD::FDIV:
685 case ISD::FREM:
686 case ISD::INLINEASM:
690
691 // TODO: Should really be looking at the users of the bitcast. These are
692 // problematic because bitcasts are used to legalize all stores to integer
693 // types.
694 case ISD::BITCAST:
695 return false;
697 switch (N->getConstantOperandVal(0)) {
698 case Intrinsic::amdgcn_interp_p1:
699 case Intrinsic::amdgcn_interp_p2:
700 case Intrinsic::amdgcn_interp_mov:
701 case Intrinsic::amdgcn_interp_p1_f16:
702 case Intrinsic::amdgcn_interp_p2_f16:
703 return false;
704 default:
705 return true;
706 }
707 }
708 case ISD::SELECT:
710 default:
711 return true;
712 }
713}
714
716 unsigned CostThreshold) {
717 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
718 // it is truly free to use a source modifier in all cases. If there are
719 // multiple users and each one will require the VOP3 encoding, there will be
720 // a code size increase. Try to avoid increasing code size unless we know it
721 // will save on the instruction count.
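// For example, folding an fneg into a 3-operand FMA user is free since FMA
// already uses VOP3, but folding it into a lone VOP2 add would force that add
// into the larger VOP3 encoding.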
722 unsigned NumMayIncreaseSize = 0;
723 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
724
725 assert(!N->use_empty());
726
727 // XXX - Should this limit number of uses to check?
728 for (const SDNode *U : N->uses()) {
729 if (!hasSourceMods(U))
730 return false;
731
732 if (!opMustUseVOP3Encoding(U, VT)) {
733 if (++NumMayIncreaseSize > CostThreshold)
734 return false;
735 }
736 }
737
738 return true;
739}
740
742 ISD::NodeType ExtendKind) const {
743 assert(!VT.isVector() && "only scalar expected");
744
745 // Round to the next multiple of 32-bits.
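// e.g. an i40 return value is extended to i64 and an i65 value to i96.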
746 unsigned Size = VT.getSizeInBits();
747 if (Size <= 32)
748 return MVT::i32;
749 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
750}
751
753 return MVT::i32;
754}
755
757 return true;
758}
759
760// The backend supports 32- and 64-bit floating-point immediates.
761// FIXME: Why are we reporting vectors of FP immediates as legal?
763 bool ForCodeSize) const {
764 EVT ScalarVT = VT.getScalarType();
765 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
766 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
767}
768
769// We don't want to shrink f64 / f32 constants.
771 EVT ScalarVT = VT.getScalarType();
772 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
773}
774
776 ISD::LoadExtType ExtTy,
777 EVT NewVT) const {
778 // TODO: This may be worth removing. Check regression tests for diffs.
780 return false;
781
782 unsigned NewSize = NewVT.getStoreSizeInBits();
783
784 // If we are reducing to a 32-bit load or a smaller multi-dword load,
785 // this is always better.
786 if (NewSize >= 32)
787 return true;
788
789 EVT OldVT = N->getValueType(0);
790 unsigned OldSize = OldVT.getStoreSizeInBits();
791
792 MemSDNode *MN = cast<MemSDNode>(N);
793 unsigned AS = MN->getAddressSpace();
794 // Do not shrink an aligned scalar load to sub-dword.
795 // Scalar engine cannot do sub-dword loads.
796 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
797 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
800 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
801 MN->isInvariant())) &&
803 return false;
804
805 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
806 // extloads, so doing one requires using a buffer_load. In cases where we
807 // still couldn't use a scalar load, using the wider load shouldn't really
808 // hurt anything.
809
810 // If the old size already had to be an extload, there's no harm in continuing
811 // to reduce the width.
812 return (OldSize < 32);
813}
814
816 const SelectionDAG &DAG,
817 const MachineMemOperand &MMO) const {
818
819 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
820
821 if (LoadTy.getScalarType() == MVT::i32)
822 return false;
823
824 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
825 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
826
827 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
828 return false;
829
830 unsigned Fast = 0;
832 CastTy, MMO, &Fast) &&
833 Fast;
834}
835
836// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
837// profitable with the expansion for 64-bit since it's generally good to
838// speculate things.
840 return true;
841}
842
844 return true;
845}
846
848 switch (N->getOpcode()) {
849 case ISD::EntryToken:
850 case ISD::TokenFactor:
851 return true;
853 unsigned IntrID = N->getConstantOperandVal(0);
854 switch (IntrID) {
855 case Intrinsic::amdgcn_readfirstlane:
856 case Intrinsic::amdgcn_readlane:
857 return true;
858 }
859 return false;
860 }
861 case ISD::LOAD:
862 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
864 return true;
865 return false;
866 case AMDGPUISD::SETCC: // ballot-style instruction
867 return true;
868 }
869 return false;
870}
871
873 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
874 NegatibleCost &Cost, unsigned Depth) const {
875
876 switch (Op.getOpcode()) {
877 case ISD::FMA:
878 case ISD::FMAD: {
879 // Negating a fma is not free if it has users without source mods.
880 if (!allUsesHaveSourceMods(Op.getNode()))
881 return SDValue();
882 break;
883 }
884 case AMDGPUISD::RCP: {
885 SDValue Src = Op.getOperand(0);
886 EVT VT = Op.getValueType();
887 SDLoc SL(Op);
888
889 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
890 ForCodeSize, Cost, Depth + 1);
891 if (NegSrc)
892 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
893 return SDValue();
894 }
895 default:
896 break;
897 }
898
899 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
900 ForCodeSize, Cost, Depth);
901}
902
903//===---------------------------------------------------------------------===//
904// Target Properties
905//===---------------------------------------------------------------------===//
906
909
910 // Packed operations do not have a fabs modifier.
911 return VT == MVT::f32 || VT == MVT::f64 ||
912 (Subtarget->has16BitInsts() && VT == MVT::f16);
913}
914
917 // Report this based on the end legalized type.
918 VT = VT.getScalarType();
919 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
920}
921
923 unsigned NumElem,
924 unsigned AS) const {
925 return true;
926}
927
929 // There are few operations which truly have vector input operands. Any vector
930 // operation is going to involve operations on each component, and a
931 // build_vector will be a copy per element, so it always makes sense to use a
932 // build_vector input in place of the extracted element to avoid a copy into a
933 // super register.
934 //
935 // We should probably only do this if all users are extracts only, but this
936 // should be the common case.
937 return true;
938}
939
941 // Truncate is just accessing a subregister.
942
943 unsigned SrcSize = Source.getSizeInBits();
944 unsigned DestSize = Dest.getSizeInBits();
945
946 return DestSize < SrcSize && DestSize % 32 == 0;
947}
948
950 // Truncate is just accessing a subregister.
951
952 unsigned SrcSize = Source->getScalarSizeInBits();
953 unsigned DestSize = Dest->getScalarSizeInBits();
954
955 if (DestSize == 16 && Subtarget->has16BitInsts())
956 return SrcSize >= 32;
957
958 return DestSize < SrcSize && DestSize % 32 == 0;
959}
960
962 unsigned SrcSize = Src->getScalarSizeInBits();
963 unsigned DestSize = Dest->getScalarSizeInBits();
964
965 if (SrcSize == 16 && Subtarget->has16BitInsts())
966 return DestSize >= 32;
967
968 return SrcSize == 32 && DestSize == 64;
969}
970
972 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
973 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
974 // this will enable reducing 64-bit operations to 32-bit, which is always
975 // good.
976
977 if (Src == MVT::i16)
978 return Dest == MVT::i32 || Dest == MVT::i64;
979
980 return Src == MVT::i32 && Dest == MVT::i64;
981}
982
984 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
985 // limited number of native 64-bit operations. Shrinking an operation to fit
986 // in a single 32-bit register should always be helpful. As currently used,
987 // this is much less general than the name suggests, and is only used in
988 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
989 // not profitable, and may actually be harmful.
990 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
991}
992
994 const SDNode* N, CombineLevel Level) const {
995 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
996 N->getOpcode() == ISD::SRL) &&
997 "Expected shift op");
998 // Always commute pre-type legalization and right shifts.
999 // We're looking for shl(or(x,y),z) patterns.
1001 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1002 return true;
1003
1004 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1005 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1006 (N->use_begin()->getOpcode() == ISD::SRA ||
1007 N->use_begin()->getOpcode() == ISD::SRL))
1008 return false;
1009
1010 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1011 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1012 if (LHS.getOpcode() != ISD::SHL)
1013 return false;
1014 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1015 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1016 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1017 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1018 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1019 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1020 };
1021 SDValue LHS = N->getOperand(0).getOperand(0);
1022 SDValue RHS = N->getOperand(0).getOperand(1);
1023 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1024}
1025
1026//===---------------------------------------------------------------------===//
1027// TargetLowering Callbacks
1028//===---------------------------------------------------------------------===//
1029
1031 bool IsVarArg) {
1032 switch (CC) {
1040 return CC_AMDGPU;
1043 return CC_AMDGPU_CS_CHAIN;
1044 case CallingConv::C:
1045 case CallingConv::Fast:
1046 case CallingConv::Cold:
1047 return CC_AMDGPU_Func;
1049 return CC_SI_Gfx;
1052 default:
1053 report_fatal_error("Unsupported calling convention for call");
1054 }
1055}
1056
1058 bool IsVarArg) {
1059 switch (CC) {
1062 llvm_unreachable("kernels should not be handled here");
1072 return RetCC_SI_Shader;
1074 return RetCC_SI_Gfx;
1075 case CallingConv::C:
1076 case CallingConv::Fast:
1077 case CallingConv::Cold:
1078 return RetCC_AMDGPU_Func;
1079 default:
1080 report_fatal_error("Unsupported calling convention.");
1081 }
1082}
1083
1084/// The SelectionDAGBuilder will automatically promote function arguments
1085/// with illegal types. However, this does not work for the AMDGPU targets
1086/// since the function arguments are stored in memory as these illegal types.
1087/// In order to handle this properly we need to get the original types sizes
1088/// from the LLVM IR Function and fix up the ISD::InputArg values before
1089/// passing them to AnalyzeFormalArguments()
1090
1091/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1092/// input values across multiple registers. Each item in the Ins array
1093/// represents a single value that will be stored in registers. Ins[x].VT is
1094/// the value type of the value that will be stored in the register, so
1095/// whatever SDNode we lower the argument to needs to be this type.
1096///
1097/// In order to correctly lower the arguments we need to know the size of each
1098/// argument. Since Ins[x].VT gives us the size of the register that will
1099/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1100/// for the original function argument so that we can deduce the correct memory
1101/// type to use for Ins[x]. In most cases the correct memory type will be
1102/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1103/// we have a kernel argument of type v8i8, this argument will be split into
1104/// 8 parts and each part will be represented by its own item in the Ins array.
1105/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1106/// the argument before it was split. From this, we deduce that the memory type
1107/// for each individual part is i8. We pass the memory type as LocVT to the
1108/// calling convention analysis function and the register type (Ins[x].VT) as
1109/// the ValVT.
1111 CCState &State,
1112 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1113 const MachineFunction &MF = State.getMachineFunction();
1114 const Function &Fn = MF.getFunction();
1115 LLVMContext &Ctx = Fn.getParent()->getContext();
1116 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1117 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1119
1120 Align MaxAlign = Align(1);
1121 uint64_t ExplicitArgOffset = 0;
1122 const DataLayout &DL = Fn.getParent()->getDataLayout();
1123
1124 unsigned InIndex = 0;
1125
1126 for (const Argument &Arg : Fn.args()) {
1127 const bool IsByRef = Arg.hasByRefAttr();
1128 Type *BaseArgTy = Arg.getType();
1129 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1130 Align Alignment = DL.getValueOrABITypeAlignment(
1131 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1132 MaxAlign = std::max(Alignment, MaxAlign);
1133 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1134
1135 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1136 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1137
1138 // We're basically throwing away everything passed into us and starting over
1139 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1140 // to us as computed in Ins.
1141 //
1142 // We also need to figure out what type legalization is trying to do to get
1143 // the correct memory offsets.
1144
1145 SmallVector<EVT, 16> ValueVTs;
1147 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1148
1149 for (unsigned Value = 0, NumValues = ValueVTs.size();
1150 Value != NumValues; ++Value) {
1151 uint64_t BasePartOffset = Offsets[Value];
1152
1153 EVT ArgVT = ValueVTs[Value];
1154 EVT MemVT = ArgVT;
1155 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1156 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1157
1158 if (NumRegs == 1) {
1159 // This argument is not split, so the IR type is the memory type.
1160 if (ArgVT.isExtended()) {
1161 // We have an extended type, like i24, so we should just use the
1162 // register type.
1163 MemVT = RegisterVT;
1164 } else {
1165 MemVT = ArgVT;
1166 }
1167 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1168 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1169 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1170 // We have a vector value which has been split into a vector with
1171 // the same scalar type, but fewer elements. This should handle
1172 // all the floating-point vector types.
1173 MemVT = RegisterVT;
1174 } else if (ArgVT.isVector() &&
1175 ArgVT.getVectorNumElements() == NumRegs) {
1176 // This arg has been split so that each element is stored in a separate
1177 // register.
1178 MemVT = ArgVT.getScalarType();
1179 } else if (ArgVT.isExtended()) {
1180 // We have an extended type, like i65.
1181 MemVT = RegisterVT;
1182 } else {
1183 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1184 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1185 if (RegisterVT.isInteger()) {
1186 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1187 } else if (RegisterVT.isVector()) {
1188 assert(!RegisterVT.getScalarType().isFloatingPoint());
1189 unsigned NumElements = RegisterVT.getVectorNumElements();
1190 assert(MemoryBits % NumElements == 0);
1191 // This vector type has been split into another vector type with
1192 // a different element size.
1193 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1194 MemoryBits / NumElements);
1195 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1196 } else {
1197 llvm_unreachable("cannot deduce memory type.");
1198 }
1199 }
1200
1201 // Convert one element vectors to scalar.
1202 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1203 MemVT = MemVT.getScalarType();
1204
1205 // Round up vec3/vec5 argument.
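// e.g. v3i32 is rounded up to v4i32, v5i32 to v8i32, and v9i32-v12i32 to
// v16i32.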
1206 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1207 assert(MemVT.getVectorNumElements() == 3 ||
1208 MemVT.getVectorNumElements() == 5 ||
1209 (MemVT.getVectorNumElements() >= 9 &&
1210 MemVT.getVectorNumElements() <= 12));
1211 MemVT = MemVT.getPow2VectorType(State.getContext());
1212 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1213 MemVT = MemVT.getRoundIntegerType(State.getContext());
1214 }
1215
1216 unsigned PartOffset = 0;
1217 for (unsigned i = 0; i != NumRegs; ++i) {
1218 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1219 BasePartOffset + PartOffset,
1220 MemVT.getSimpleVT(),
1222 PartOffset += MemVT.getStoreSize();
1223 }
1224 }
1225 }
1226}
1227
1229 SDValue Chain, CallingConv::ID CallConv,
1230 bool isVarArg,
1232 const SmallVectorImpl<SDValue> &OutVals,
1233 const SDLoc &DL, SelectionDAG &DAG) const {
1234 // FIXME: Fails for r600 tests
1235 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1236 // "wave terminate should not have return values");
1237 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1238}
1239
1240//===---------------------------------------------------------------------===//
1241// Target specific lowering
1242//===---------------------------------------------------------------------===//
1243
1244/// Selects the correct CCAssignFn for a given CallingConvention value.
1246 bool IsVarArg) {
1247 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1248}
1249
1251 bool IsVarArg) {
1253}
1254
1256 SelectionDAG &DAG,
1257 MachineFrameInfo &MFI,
1258 int ClobberedFI) const {
1259 SmallVector<SDValue, 8> ArgChains;
1260 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1261 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1262
1263 // Include the original chain at the beginning of the list. When this is
1264 // used by target LowerCall hooks, this helps legalize find the
1265 // CALLSEQ_BEGIN node.
1266 ArgChains.push_back(Chain);
1267
1268 // Add a chain value for each stack argument load that overlaps the clobbered object.
1269 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1270 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1271 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1272 if (FI->getIndex() < 0) {
1273 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1274 int64_t InLastByte = InFirstByte;
1275 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1276
1277 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1278 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1279 ArgChains.push_back(SDValue(L, 1));
1280 }
1281 }
1282 }
1283 }
1284
1285 // Build a tokenfactor for all the chains.
1286 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1287}
1288
1291 StringRef Reason) const {
1292 SDValue Callee = CLI.Callee;
1293 SelectionDAG &DAG = CLI.DAG;
1294
1295 const Function &Fn = DAG.getMachineFunction().getFunction();
1296
1297 StringRef FuncName("<unknown>");
1298
1299 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1300 FuncName = G->getSymbol();
1301 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1302 FuncName = G->getGlobal()->getName();
1303
1305 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1306 DAG.getContext()->diagnose(NoCalls);
1307
1308 if (!CLI.IsTailCall) {
1309 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1310 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1311 }
1312
1313 return DAG.getEntryNode();
1314}
1315
1317 SmallVectorImpl<SDValue> &InVals) const {
1318 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1319}
1320
1322 SelectionDAG &DAG) const {
1323 const Function &Fn = DAG.getMachineFunction().getFunction();
1324
1325 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1326 SDLoc(Op).getDebugLoc());
1327 DAG.getContext()->diagnose(NoDynamicAlloca);
1328 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1329 return DAG.getMergeValues(Ops, SDLoc());
1330}
1331
1333 SelectionDAG &DAG) const {
1334 switch (Op.getOpcode()) {
1335 default:
1336 Op->print(errs(), &DAG);
1337 llvm_unreachable("Custom lowering code for this "
1338 "instruction is not implemented yet!");
1339 break;
1341 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1343 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1344 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1345 case ISD::FREM: return LowerFREM(Op, DAG);
1346 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1347 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1348 case ISD::FRINT: return LowerFRINT(Op, DAG);
1349 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1350 case ISD::FROUNDEVEN:
1351 return LowerFROUNDEVEN(Op, DAG);
1352 case ISD::FROUND: return LowerFROUND(Op, DAG);
1353 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1354 case ISD::FLOG2:
1355 return LowerFLOG2(Op, DAG);
1356 case ISD::FLOG:
1357 case ISD::FLOG10:
1358 return LowerFLOGCommon(Op, DAG);
1359 case ISD::FEXP:
1360 case ISD::FEXP10:
1361 return lowerFEXP(Op, DAG);
1362 case ISD::FEXP2:
1363 return lowerFEXP2(Op, DAG);
1364 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1365 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1366 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1367 case ISD::FP_TO_SINT:
1368 case ISD::FP_TO_UINT:
1369 return LowerFP_TO_INT(Op, DAG);
1370 case ISD::CTTZ:
1372 case ISD::CTLZ:
1374 return LowerCTLZ_CTTZ(Op, DAG);
1376 }
1377 return Op;
1378}
1379
1382 SelectionDAG &DAG) const {
1383 switch (N->getOpcode()) {
1385 // Different parts of legalization seem to interpret which type of
1386 // sign_extend_inreg is the one to check for custom lowering. The extended
1387 // from type is what really matters, but some places check for custom
1388 // lowering of the result type. This results in trying to use
1389 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1390 // nothing here and let the illegal result integer be handled normally.
1391 return;
1392 case ISD::FLOG2:
1393 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1394 Results.push_back(Lowered);
1395 return;
1396 case ISD::FLOG:
1397 case ISD::FLOG10:
1398 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1399 Results.push_back(Lowered);
1400 return;
1401 case ISD::FEXP2:
1402 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1403 Results.push_back(Lowered);
1404 return;
1405 case ISD::FEXP:
1406 case ISD::FEXP10:
1407 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1408 Results.push_back(Lowered);
1409 return;
1410 case ISD::CTLZ:
1412 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1413 Results.push_back(Lowered);
1414 return;
1415 default:
1416 return;
1417 }
1418}
1419
1421 SDValue Op,
1422 SelectionDAG &DAG) const {
1423
1424 const DataLayout &DL = DAG.getDataLayout();
1425 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1426 const GlobalValue *GV = G->getGlobal();
1427
1428 if (!MFI->isModuleEntryFunction()) {
1429 if (std::optional<uint32_t> Address =
1431 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1432 }
1433 }
1434
1435 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1436 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1437 if (!MFI->isModuleEntryFunction() &&
1438 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1439 SDLoc DL(Op);
1440 const Function &Fn = DAG.getMachineFunction().getFunction();
1441 DiagnosticInfoUnsupported BadLDSDecl(
1442 Fn, "local memory global used by non-kernel function",
1443 DL.getDebugLoc(), DS_Warning);
1444 DAG.getContext()->diagnose(BadLDSDecl);
1445
1446 // We currently don't have a way to correctly allocate LDS objects that
1447 // aren't directly associated with a kernel. We do force inlining of
1448 // functions that use local objects. However, if these dead functions are
1449 // not eliminated, we don't want a compile time error. Just emit a warning
1450 // and a trap, since there should be no callable path here.
1451 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1452 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1453 Trap, DAG.getRoot());
1454 DAG.setRoot(OutputChain);
1455 return DAG.getUNDEF(Op.getValueType());
1456 }
1457
1458 // XXX: What does the value of G->getOffset() mean?
1459 assert(G->getOffset() == 0 &&
1460 "Do not know what to do with an non-zero offset");
1461
1462 // TODO: We could emit code to handle the initialization somewhere.
1463 // We ignore the initializer for now and legalize it to allow selection.
1464 // The initializer will anyway get errored out during assembly emission.
1465 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1466 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1467 }
1468 return SDValue();
1469}
1470
1472 SelectionDAG &DAG) const {
1474 SDLoc SL(Op);
1475
1476 EVT VT = Op.getValueType();
1477 if (VT.getVectorElementType().getSizeInBits() < 32) {
1478 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1479 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1480 unsigned NewNumElt = OpBitSize / 32;
1481 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1483 MVT::i32, NewNumElt);
1484 for (const SDUse &U : Op->ops()) {
1485 SDValue In = U.get();
1486 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1487 if (NewNumElt > 1)
1488 DAG.ExtractVectorElements(NewIn, Args);
1489 else
1490 Args.push_back(NewIn);
1491 }
1492
1493 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1494 NewNumElt * Op.getNumOperands());
1495 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1496 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1497 }
1498 }
1499
1500 for (const SDUse &U : Op->ops())
1501 DAG.ExtractVectorElements(U.get(), Args);
1502
1503 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1504}
1505
1507 SelectionDAG &DAG) const {
1508 SDLoc SL(Op);
1510 unsigned Start = Op.getConstantOperandVal(1);
1511 EVT VT = Op.getValueType();
1512 EVT SrcVT = Op.getOperand(0).getValueType();
1513
1514 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1515 unsigned NumElt = VT.getVectorNumElements();
1516 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1517 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1518
1519 // Extract 32-bit registers at a time.
1520 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1521 EVT NewVT = NumElt == 2
1522 ? MVT::i32
1523 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1524 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1525
1526 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1527 if (NumElt == 2)
1528 Tmp = Args[0];
1529 else
1530 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1531
1532 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1533 }
1534
1535 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1537
1538 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1539}
1540
1541// TODO: Handle fabs too
1543 if (Val.getOpcode() == ISD::FNEG)
1544 return Val.getOperand(0);
1545
1546 return Val;
1547}
1548
1550 if (Val.getOpcode() == ISD::FNEG)
1551 Val = Val.getOperand(0);
1552 if (Val.getOpcode() == ISD::FABS)
1553 Val = Val.getOperand(0);
1554 if (Val.getOpcode() == ISD::FCOPYSIGN)
1555 Val = Val.getOperand(0);
1556 return Val;
1557}
1558
1560 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1561 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1562 SelectionDAG &DAG = DCI.DAG;
1563 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1564 switch (CCOpcode) {
1565 case ISD::SETOEQ:
1566 case ISD::SETONE:
1567 case ISD::SETUNE:
1568 case ISD::SETNE:
1569 case ISD::SETUEQ:
1570 case ISD::SETEQ:
1571 case ISD::SETFALSE:
1572 case ISD::SETFALSE2:
1573 case ISD::SETTRUE:
1574 case ISD::SETTRUE2:
1575 case ISD::SETUO:
1576 case ISD::SETO:
1577 break;
1578 case ISD::SETULE:
1579 case ISD::SETULT: {
1580 if (LHS == True)
1581 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1582 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1583 }
1584 case ISD::SETOLE:
1585 case ISD::SETOLT:
1586 case ISD::SETLE:
1587 case ISD::SETLT: {
1588 // Ordered. Assume ordered for undefined.
1589
1590 // Only do this after legalization to avoid interfering with other combines
1591 // which might occur.
1593 !DCI.isCalledByLegalizer())
1594 return SDValue();
1595
1596 // We need to permute the operands to get the correct NaN behavior. The
1597 // selected operand is the second one based on the failing compare with NaN,
1598 // so permute it based on the compare type the hardware uses.
1599 if (LHS == True)
1600 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1601 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1602 }
1603 case ISD::SETUGE:
1604 case ISD::SETUGT: {
1605 if (LHS == True)
1606 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1607 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1608 }
1609 case ISD::SETGT:
1610 case ISD::SETGE:
1611 case ISD::SETOGE:
1612 case ISD::SETOGT: {
1614 !DCI.isCalledByLegalizer())
1615 return SDValue();
1616
1617 if (LHS == True)
1618 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1619 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1620 }
1621 case ISD::SETCC_INVALID:
1622 llvm_unreachable("Invalid setcc condcode!");
1623 }
1624 return SDValue();
1625}
1626
1627/// Generate Min/Max node
1629 SDValue LHS, SDValue RHS,
1630 SDValue True, SDValue False,
1631 SDValue CC,
1632 DAGCombinerInfo &DCI) const {
1633 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1634 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1635
1636 SelectionDAG &DAG = DCI.DAG;
1637
1638 // If we can't directly match this, try to see if we can fold an fneg to
1639 // match.
1640
1641 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1642 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1643 SDValue NegTrue = peekFNeg(True);
1644
1645 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1646 // fmin/fmax.
1647 //
1648 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1649 // -> fneg (fmin_legacy lhs, K)
1650 //
1651 // TODO: Use getNegatedExpression
1652 if (LHS == NegTrue && CFalse && CRHS) {
1653 APFloat NegRHS = neg(CRHS->getValueAPF());
1654 if (NegRHS == CFalse->getValueAPF()) {
1655 SDValue Combined =
1656 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1657 if (Combined)
1658 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1659 return SDValue();
1660 }
1661 }
1662
1663 return SDValue();
1664}
1665
1666std::pair<SDValue, SDValue>
1668 SDLoc SL(Op);
1669
1670 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1671
1672 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1673 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1674
1675 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1676 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1677
1678 return std::pair(Lo, Hi);
1679}
1680
1682 SDLoc SL(Op);
1683
1684 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1685 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1686 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1687}
1688
1690 SDLoc SL(Op);
1691
1692 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1693 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1694 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1695}
1696
1697// Split a vector type into two parts. The first part is a power of two vector.
1698// The second part is whatever is left over, and is a scalar if it would
1699// otherwise be a 1-vector.
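// For example, v7i32 splits into (v4i32, v3i32) and v3f32 into (v2f32, f32).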
1700std::pair<EVT, EVT>
1702 EVT LoVT, HiVT;
1703 EVT EltVT = VT.getVectorElementType();
1704 unsigned NumElts = VT.getVectorNumElements();
1705 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1706 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1707 HiVT = NumElts - LoNumElts == 1
1708 ? EltVT
1709 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1710 return std::pair(LoVT, HiVT);
1711}
1712
1713// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1714// scalar.
1715std::pair<SDValue, SDValue>
1717 const EVT &LoVT, const EVT &HiVT,
1718 SelectionDAG &DAG) const {
1720 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1721 N.getValueType().getVectorNumElements() &&
1722 "More vector elements requested than available!");
1724 DAG.getVectorIdxConstant(0, DL));
1725 SDValue Hi = DAG.getNode(
1727 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1728 return std::pair(Lo, Hi);
1729}
1730
1732 SelectionDAG &DAG) const {
1733 LoadSDNode *Load = cast<LoadSDNode>(Op);
1734 EVT VT = Op.getValueType();
1735 SDLoc SL(Op);
1736
1737
1738 // If this is a 2 element vector, we really want to scalarize and not create
1739 // weird 1 element vectors.
1740 if (VT.getVectorNumElements() == 2) {
1741 SDValue Ops[2];
1742 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1743 return DAG.getMergeValues(Ops, SL);
1744 }
1745
1746 SDValue BasePtr = Load->getBasePtr();
1747 EVT MemVT = Load->getMemoryVT();
1748
1749 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1750
1751 EVT LoVT, HiVT;
1752 EVT LoMemVT, HiMemVT;
1753 SDValue Lo, Hi;
1754
1755 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1756 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1757 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1758
1759 unsigned Size = LoMemVT.getStoreSize();
1760 Align BaseAlign = Load->getAlign();
1761 Align HiAlign = commonAlignment(BaseAlign, Size);
1762
1763 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1764 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1765 BaseAlign, Load->getMemOperand()->getFlags());
1766 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1767 SDValue HiLoad =
1768 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1769 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1770 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1771
1772 SDValue Join;
1773 if (LoVT == HiVT) {
1774 // This is the case that the vector is power of two so was evenly split.
1775 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1776 } else {
1777 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1778 DAG.getVectorIdxConstant(0, SL));
1779 Join = DAG.getNode(
1781 VT, Join, HiLoad,
1783 }
1784
1785 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1786 LoLoad.getValue(1), HiLoad.getValue(1))};
1787
1788 return DAG.getMergeValues(Ops, SL);
1789}
1790
1792 SelectionDAG &DAG) const {
1793 LoadSDNode *Load = cast<LoadSDNode>(Op);
1794 EVT VT = Op.getValueType();
1795 SDValue BasePtr = Load->getBasePtr();
1796 EVT MemVT = Load->getMemoryVT();
1797 SDLoc SL(Op);
1798 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1799 Align BaseAlign = Load->getAlign();
1800 unsigned NumElements = MemVT.getVectorNumElements();
1801
1802 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1803 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
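// e.g. a sufficiently aligned v3i32 load becomes a single v4i32 load, with the
// original vector taken from it via EXTRACT_SUBVECTOR.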
1804 if (NumElements != 3 ||
1805 (BaseAlign < Align(8) &&
1806 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1807 return SplitVectorLoad(Op, DAG);
1808
1809 assert(NumElements == 3);
1810
1811 EVT WideVT =
1813 EVT WideMemVT =
1815 SDValue WideLoad = DAG.getExtLoad(
1816 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1817 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1818 return DAG.getMergeValues(
1819 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1820 DAG.getVectorIdxConstant(0, SL)),
1821 WideLoad.getValue(1)},
1822 SL);
1823}
1824
1826 SelectionDAG &DAG) const {
1827 StoreSDNode *Store = cast<StoreSDNode>(Op);
1828 SDValue Val = Store->getValue();
1829 EVT VT = Val.getValueType();
1830
1831 // If this is a 2 element vector, we really want to scalarize and not create
1832 // weird 1 element vectors.
1833 if (VT.getVectorNumElements() == 2)
1834 return scalarizeVectorStore(Store, DAG);
1835
1836 EVT MemVT = Store->getMemoryVT();
1837 SDValue Chain = Store->getChain();
1838 SDValue BasePtr = Store->getBasePtr();
1839 SDLoc SL(Op);
1840
1841 EVT LoVT, HiVT;
1842 EVT LoMemVT, HiMemVT;
1843 SDValue Lo, Hi;
1844
1845 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1846 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1847 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1848
1849 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1850
1851 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1852 Align BaseAlign = Store->getAlign();
1853 unsigned Size = LoMemVT.getStoreSize();
1854 Align HiAlign = commonAlignment(BaseAlign, Size);
1855
1856 SDValue LoStore =
1857 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1858 Store->getMemOperand()->getFlags());
1859 SDValue HiStore =
1860 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1861 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1862
1863 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1864}
1865
1866// This is a shortcut for integer division because we have fast i32<->f32
1867// conversions, and fast f32 reciprocal instructions. The 24-bit significand of
1868// an f32 is enough to accurately represent up to a 24-bit signed integer.
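//
// Illustrative example (unsigned case, operands known to fit in 24 bits):
//   100 / 7: fq = trunc(100.0f * rcp(7.0f)) = 14.0f, so iq = 14;
//   fr = |fma(-14.0f, 7.0f, 100.0f)| = 2.0f < 7.0f, so no correction is
//   applied and the result is 14 remainder 2.
//   14 / 7: rcp may round slightly low, giving fq = trunc(1.99999f) = 1.0f;
//   fr = |fma(-1.0f, 7.0f, 14.0f)| = 7.0f >= 7.0f, so the +1 (jq) correction
//   fires and the result is still 2 remainder 0.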
1869SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1870                                            bool Sign) const {
1871 SDLoc DL(Op);
1872 EVT VT = Op.getValueType();
1873 SDValue LHS = Op.getOperand(0);
1874 SDValue RHS = Op.getOperand(1);
1875 MVT IntVT = MVT::i32;
1876 MVT FltVT = MVT::f32;
1877
1878 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1879 if (LHSSignBits < 9)
1880 return SDValue();
1881
1882 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1883 if (RHSSignBits < 9)
1884 return SDValue();
1885
1886 unsigned BitSize = VT.getSizeInBits();
1887 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1888 unsigned DivBits = BitSize - SignBits;
1889 if (Sign)
1890 ++DivBits;
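  // With >= 9 sign bits on each 32-bit operand, DivBits <= 23 here (24 when
  // Sign adds the sign bit back), so the quotient fits exactly in an f32
  // significand (illustrative bound, following the checks above).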
1891
1892  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1893  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1894 
1895 SDValue jq = DAG.getConstant(1, DL, IntVT);
1896
1897 if (Sign) {
1898 // char|short jq = ia ^ ib;
1899 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1900
1901 // jq = jq >> (bitsize - 2)
1902 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1903 DAG.getConstant(BitSize - 2, DL, VT));
1904
1905 // jq = jq | 0x1
1906 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1907 }
1908
1909 // int ia = (int)LHS;
1910 SDValue ia = LHS;
1911
1912  // int ib = (int)RHS;
1913 SDValue ib = RHS;
1914
1915 // float fa = (float)ia;
1916 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1917
1918 // float fb = (float)ib;
1919 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1920
1921 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1922 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1923
1924 // fq = trunc(fq);
1925 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1926
1927 // float fqneg = -fq;
1928 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1929
1930  MachineFunction &MF = DAG.getMachineFunction();
1931 
1932 bool UseFmadFtz = false;
1933 if (Subtarget->isGCN()) {
1934    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1935    UseFmadFtz =
1936        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1937  }
1938
1939 // float fr = mad(fqneg, fb, fa);
1940 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1941 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1943 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1944
1945 // int iq = (int)fq;
1946 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1947
1948 // fr = fabs(fr);
1949 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1950
1951 // fb = fabs(fb);
1952 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1953
1954 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1955
1956 // int cv = fr >= fb;
1957 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1958
1959 // jq = (cv ? jq : 0);
1960 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1961
1962 // dst = iq + jq;
1963 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1964
1965  // Rem needs compensation; it's easier to recompute it.
1966 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1967 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1968
1969 // Truncate to number of bits this divide really is.
1970 if (Sign) {
1971 SDValue InRegSize
1972 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1973 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1974 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1975 } else {
1976 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1977 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1978 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1979 }
1980
1981 return DAG.getMergeValues({ Div, Rem }, DL);
1982}
1983
1984void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1985                                          SelectionDAG &DAG,
1986                                          SmallVectorImpl<SDValue> &Results) const {
1987  SDLoc DL(Op);
1988 EVT VT = Op.getValueType();
1989
1990 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1991
1992 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1993
1994 SDValue One = DAG.getConstant(1, DL, HalfVT);
1995 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1996
1997  // Hi/Lo split
1998 SDValue LHS_Lo, LHS_Hi;
1999 SDValue LHS = Op.getOperand(0);
2000 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2001
2002 SDValue RHS_Lo, RHS_Hi;
2003 SDValue RHS = Op.getOperand(1);
2004 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2005
2006  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2007      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2008 
2009 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2010 LHS_Lo, RHS_Lo);
2011
2012 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2013 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2014
2015 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2016 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2017 return;
2018 }
2019
2020 if (isTypeLegal(MVT::i64)) {
2021 // The algorithm here is based on ideas from "Software Integer Division",
2022 // Tom Rodeheffer, August 2008.
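    // Rough sketch of the steps below (informal summary, not from the paper):
    // start from a float-based estimate Rcp64 ~= 2^64 / RHS, refine it twice
    // with the fixed-point Newton-Raphson update R += mulhu(R, -RHS * R),
    // take Mulhi3 = mulhu(LHS, R) as the quotient estimate, and apply at most
    // two +1 corrections driven by the remainder comparisons.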
2023
2024    MachineFunction &MF = DAG.getMachineFunction();
2025    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2026 
2027 // Compute denominator reciprocal.
2028 unsigned FMAD =
2029 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2030        : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2031            ? (unsigned)ISD::FMAD
2032            : (unsigned)AMDGPUISD::FMAD_FTZ;
2033
2034 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2035 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2036 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2037 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2038 Cvt_Lo);
2039 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2040 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2041 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2042 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2043 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2044 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2045 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2046 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2047 Mul1);
2048 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2049 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2050 SDValue Rcp64 = DAG.getBitcast(VT,
2051 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2052
2053 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2054 SDValue One64 = DAG.getConstant(1, DL, VT);
2055 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2056 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2057
2058 // First round of UNR (Unsigned integer Newton-Raphson).
2059 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2060 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2061 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2062 SDValue Mulhi1_Lo, Mulhi1_Hi;
2063 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2064 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2065 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2066 Mulhi1_Lo, Zero1);
2067 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2068 Mulhi1_Hi, Add1_Lo.getValue(1));
2069 SDValue Add1 = DAG.getBitcast(VT,
2070 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2071
2072 // Second round of UNR.
2073 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2074 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2075 SDValue Mulhi2_Lo, Mulhi2_Hi;
2076 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2077 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2078 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2079 Mulhi2_Lo, Zero1);
2080 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2081 Mulhi2_Hi, Add2_Lo.getValue(1));
2082 SDValue Add2 = DAG.getBitcast(VT,
2083 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2084
2085 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2086
2087 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2088
2089 SDValue Mul3_Lo, Mul3_Hi;
2090 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2091 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2092 Mul3_Lo, Zero1);
2093 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2094 Mul3_Hi, Sub1_Lo.getValue(1));
2095 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2096 SDValue Sub1 = DAG.getBitcast(VT,
2097 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2098
2099 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2100 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2101 ISD::SETUGE);
2102 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2103 ISD::SETUGE);
2104 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2105
2106 // TODO: Here and below portions of the code can be enclosed into if/endif.
2107 // Currently control flow is unconditional and we have 4 selects after
2108 // potential endif to substitute PHIs.
2109
2110 // if C3 != 0 ...
2111 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2112 RHS_Lo, Zero1);
2113 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2114 RHS_Hi, Sub1_Lo.getValue(1));
2115 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2116 Zero, Sub2_Lo.getValue(1));
2117 SDValue Sub2 = DAG.getBitcast(VT,
2118 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2119
2120 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2121
2122 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2123 ISD::SETUGE);
2124 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2125 ISD::SETUGE);
2126 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2127
2128 // if (C6 != 0)
2129 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2130
2131 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2132 RHS_Lo, Zero1);
2133 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2134 RHS_Hi, Sub2_Lo.getValue(1));
2135 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2136 Zero, Sub3_Lo.getValue(1));
2137 SDValue Sub3 = DAG.getBitcast(VT,
2138 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2139
2140 // endif C6
2141 // endif C3
2142
2143 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2144 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2145
2146 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2147 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2148
2149 Results.push_back(Div);
2150 Results.push_back(Rem);
2151
2152 return;
2153 }
2154
2155  // r600 expansion.
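  // This is a textbook restoring long division: for each of the 32 low bits,
  // shift the partial remainder left, bring in the next bit of LHS_Lo, and if
  // the remainder is >= RHS, subtract RHS and set that quotient bit (expressed
  // below with selects rather than control flow).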
2156 // Get Speculative values
2157 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2158 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2159
2160 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2161 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2162 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2163
2164 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2165 SDValue DIV_Lo = Zero;
2166
2167 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2168
2169 for (unsigned i = 0; i < halfBitWidth; ++i) {
2170 const unsigned bitPos = halfBitWidth - i - 1;
2171 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2172 // Get value of high bit
2173 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2174 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2175 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2176
2177 // Shift
2178 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2179 // Add LHS high bit
2180 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2181
2182 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2183 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2184
2185 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2186
2187 // Update REM
2188 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2189 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2190 }
2191
2192 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2193 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2194 Results.push_back(DIV);
2195 Results.push_back(REM);
2196}
2197
2198SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2199                                           SelectionDAG &DAG) const {
2200 SDLoc DL(Op);
2201 EVT VT = Op.getValueType();
2202
2203 if (VT == MVT::i64) {
2204    SmallVector<SDValue, 2> Results;
2205    LowerUDIVREM64(Op, DAG, Results);
2206 return DAG.getMergeValues(Results, DL);
2207 }
2208
2209 if (VT == MVT::i32) {
2210 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2211 return Res;
2212 }
2213
2214 SDValue X = Op.getOperand(0);
2215 SDValue Y = Op.getOperand(1);
2216
2217 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2218 // algorithm used here.
2219
2220 // Initial estimate of inv(y).
2221 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2222
2223 // One round of UNR.
2224 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2225 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2226 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2227 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2228
2229 // Quotient/remainder estimate.
2230 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2231 SDValue R =
2232 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2233
2234 // First quotient/remainder refinement.
2235 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2236 SDValue One = DAG.getConstant(1, DL, VT);
2237 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2238 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2239 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2240 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2241 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2242
2243 // Second quotient/remainder refinement.
2244 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2245 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2246 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2247 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2248 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2249
2250 return DAG.getMergeValues({Q, R}, DL);
2251}
2252
2253SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2254                                           SelectionDAG &DAG) const {
2255 SDLoc DL(Op);
2256 EVT VT = Op.getValueType();
2257
2258 SDValue LHS = Op.getOperand(0);
2259 SDValue RHS = Op.getOperand(1);
2260
2261 SDValue Zero = DAG.getConstant(0, DL, VT);
2262 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2263
2264 if (VT == MVT::i32) {
2265 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2266 return Res;
2267 }
2268
2269 if (VT == MVT::i64 &&
2270 DAG.ComputeNumSignBits(LHS) > 32 &&
2271 DAG.ComputeNumSignBits(RHS) > 32) {
2272 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2273
2274    // Hi/Lo split
2275 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2276 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2277 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2278 LHS_Lo, RHS_Lo);
2279 SDValue Res[2] = {
2280 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2281 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2282 };
2283 return DAG.getMergeValues(Res, DL);
2284 }
2285
2286 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2287 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2288 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2289 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2290
2291 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2292 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2293
2294 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2295 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2296
2297 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2298 SDValue Rem = Div.getValue(1);
2299
2300 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2301 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2302
2303 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2304 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2305
2306 SDValue Res[2] = {
2307 Div,
2308 Rem
2309 };
2310 return DAG.getMergeValues(Res, DL);
2311}
2312
2313// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
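// Illustrative example: frem(5.5, 2.0) -> fdiv = 2.75, ftrunc = 2.0, and
// fma(-2.0, 2.0, 5.5) = 1.5, i.e. 5.5 - 2 * 2.0.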
2314SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2315  SDLoc SL(Op);
2316 EVT VT = Op.getValueType();
2317 auto Flags = Op->getFlags();
2318 SDValue X = Op.getOperand(0);
2319 SDValue Y = Op.getOperand(1);
2320
2321 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2322 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2323 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2324 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2325 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2326}
2327
2328SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2329  SDLoc SL(Op);
2330 SDValue Src = Op.getOperand(0);
2331
2332 // result = trunc(src)
2333 // if (src > 0.0 && src != result)
2334 // result += 1.0
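  //
  // e.g. src = 2.3: trunc = 2.0, src > 0.0 and src != trunc, so result = 3.0;
  //      src = -2.3: trunc = -2.0 and src < 0.0, so the result stays -2.0.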
2335
2336 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2337
2338 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2339 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2340
2341 EVT SetCCVT =
2342 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2343
2344 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2345 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2346 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2347
2348 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2349 // TODO: Should this propagate fast-math-flags?
2350 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2351}
2352
2353static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2354                                  SelectionDAG &DAG) {
2355 const unsigned FractBits = 52;
2356 const unsigned ExpBits = 11;
2357
2358 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2359 Hi,
2360 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2361 DAG.getConstant(ExpBits, SL, MVT::i32));
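  // e.g. for 1.0 (Hi = 0x3ff00000) the 11-bit field starting at bit 20 of Hi
  // is 1023, so Exp below becomes 0 (illustrative).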
2362 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2363 DAG.getConstant(1023, SL, MVT::i32));
2364
2365 return Exp;
2366}
2367
2368SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2369  SDLoc SL(Op);
2370 SDValue Src = Op.getOperand(0);
2371
2372 assert(Op.getValueType() == MVT::f64);
2373
2374 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2375
2376 // Extract the upper half, since this is where we will find the sign and
2377 // exponent.
2378 SDValue Hi = getHiHalf64(Src, DAG);
2379
2380 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2381
2382 const unsigned FractBits = 52;
2383
2384 // Extract the sign bit.
2385 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2386 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2387
2388 // Extend back to 64-bits.
2389 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2390 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2391
2392 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2393 const SDValue FractMask
2394 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2395
2396 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2397 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2398 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
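  // Illustrative example: Src = 2.5 (0x4004000000000000), Exp = 1, so
  // Shr = FractMask >> 1 = 0x0007ffffffffffff, Not = 0xfff8000000000000 and
  // Tmp0 = 0x4000000000000000, i.e. 2.0.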
2399
2400 EVT SetCCVT =
2401 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2402
2403 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2404
2405 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2406 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2407
2408 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2409 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2410
2411 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2412}
2413
2414SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2415                                              SelectionDAG &DAG) const {
2416 SDLoc SL(Op);
2417 SDValue Src = Op.getOperand(0);
2418
2419 assert(Op.getValueType() == MVT::f64);
2420
2421 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2422 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2423 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2424
2425 // TODO: Should this propagate fast-math-flags?
2426
2427 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2428 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
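  // Illustrative example: Src = 2.5, CopySign = 0x1.0p+52; 2.5 + 2^52 rounds
  // to the even neighbor 2^52 + 2 (the ulp is 1.0 at that magnitude), and
  // subtracting 2^52 back gives 2.0, i.e. round-to-nearest-even.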
2429
2430 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2431
2432 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2433 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2434
2435 EVT SetCCVT =
2436 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2437 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2438
2439 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2440}
2441
2442SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2443                                              SelectionDAG &DAG) const {
2444 // FNEARBYINT and FRINT are the same, except in their handling of FP
2445 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2446 // rint, so just treat them as equivalent.
2447 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2448 Op.getOperand(0));
2449}
2450
2451SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2452  auto VT = Op.getValueType();
2453 auto Arg = Op.getOperand(0u);
2454 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2455}
2456
2457// XXX - May require not supporting f32 denormals?
2458
2459// Don't handle v2f16. The extra instructions to scalarize and repack around the
2460// compare and vselect end up producing worse code than scalarizing the whole
2461// operation.
2462SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2463  SDLoc SL(Op);
2464 SDValue X = Op.getOperand(0);
2465 EVT VT = Op.getValueType();
2466
2467 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2468
2469 // TODO: Should this propagate fast-math-flags?
2470
2471 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2472
2473 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2474
2475 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2476 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2477
2478 EVT SetCCVT =
2479 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2480
2481 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2482 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2483 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2484
2485 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
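  // e.g. X = 2.5: T = 2.0, AbsDiff = 0.5 >= 0.5, so the offset is +1.0 and the
  // result is 3.0; for X = -2.5 the copysign makes it -1.0, giving -3.0
  // (round half away from zero).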
2486 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2487}
2488
2489SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2490  SDLoc SL(Op);
2491 SDValue Src = Op.getOperand(0);
2492
2493 // result = trunc(src);
2494 // if (src < 0.0 && src != result)
2495 // result += -1.0.
2496
2497 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2498
2499 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2500 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2501
2502 EVT SetCCVT =
2503 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2504
2505 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2506 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2507 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2508
2509 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2510 // TODO: Should this propagate fast-math-flags?
2511 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2512}
2513
2514/// Return true if it's known that \p Src can never be an f32 denormal value.
2515static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2516  switch (Src.getOpcode()) {
2517 case ISD::FP_EXTEND:
2518 return Src.getOperand(0).getValueType() == MVT::f16;
2519 case ISD::FP16_TO_FP:
2520 case ISD::FFREXP:
2521 return true;
2522  case ISD::INTRINSIC_WO_CHAIN: {
2523    unsigned IntrinsicID = Src.getConstantOperandVal(0);
2524 switch (IntrinsicID) {
2525 case Intrinsic::amdgcn_frexp_mant:
2526 return true;
2527 default:
2528 return false;
2529 }
2530 }
2531 default:
2532 return false;
2533 }
2534
2535 llvm_unreachable("covered opcode switch");
2536}
2537
2538static bool allowApproxFunc(const SelectionDAG &DAG,
2539                            SDNodeFlags Flags) {
2540 if (Flags.hasApproximateFuncs())
2541 return true;
2542 auto &Options = DAG.getTarget().Options;
2543 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2544}
2545
2546static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2547                                   SDValue Src,
2548 SDNodeFlags Flags) {
2549 return !valueIsKnownNeverF32Denorm(Src) &&
2550         DAG.getMachineFunction()
2551             .getDenormalMode(APFloat::IEEEsingle())
2552             .Input != DenormalMode::PreserveSign;
2553}
2554
2555SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2556                                                    SDValue Src,
2557 SDNodeFlags Flags) const {
2558 SDLoc SL(Src);
2559 EVT VT = Src.getValueType();
2560  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2561  SDValue SmallestNormal =
2562 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2563
2564 // Want to scale denormals up, but negatives and 0 work just as well on the
2565 // scaled path.
2566 SDValue IsLtSmallestNormal = DAG.getSetCC(
2567 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2568 SmallestNormal, ISD::SETOLT);
2569
2570 return IsLtSmallestNormal;
2571}
2572
2573SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2574                                          SDNodeFlags Flags) const {
2575 SDLoc SL(Src);
2576 EVT VT = Src.getValueType();
2577  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2578  SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2579
2580 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2581 SDValue IsFinite = DAG.getSetCC(
2582 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2583 Inf, ISD::SETOLT);
2584 return IsFinite;
2585}
2586
2587/// If denormal handling is required, return the scaled input to FLOG2, and the
2588/// check for denormal range. Otherwise, return null values.
2589std::pair<SDValue, SDValue>
2590AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2591                                        SDValue Src, SDNodeFlags Flags) const {
2592 if (!needsDenormHandlingF32(DAG, Src, Flags))
2593 return {};
2594
2595 MVT VT = MVT::f32;
2596 const fltSemantics &Semantics = APFloat::IEEEsingle();
2597 SDValue SmallestNormal =
2598 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2599
2600 SDValue IsLtSmallestNormal = DAG.getSetCC(
2601 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2602 SmallestNormal, ISD::SETOLT);
2603
2604 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2605 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2606 SDValue ScaleFactor =
2607 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2608
2609 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2610 return {ScaledInput, IsLtSmallestNormal};
2611}
2612
2613SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2614  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2615 // If we have to handle denormals, scale up the input and adjust the result.
2616
2617 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2618 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
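  // e.g. for a denormal input x = 0x1.0p-140f: scaled = 0x1.0p-108f, so
  // amdgpu_log2 returns -108.0 and subtracting 32.0 recovers -140.0
  // (illustrative).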
2619
2620 SDLoc SL(Op);
2621 EVT VT = Op.getValueType();
2622 SDValue Src = Op.getOperand(0);
2623 SDNodeFlags Flags = Op->getFlags();
2624
2625 if (VT == MVT::f16) {
2626 // Nothing in half is a denormal when promoted to f32.
2627 assert(!Subtarget->has16BitInsts());
2628 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2629 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2630 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2631 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2632 }
2633
2634 auto [ScaledInput, IsLtSmallestNormal] =
2635 getScaledLogInput(DAG, SL, Src, Flags);
2636 if (!ScaledInput)
2637 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2638
2639 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2640
2641 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2642 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2643 SDValue ResultOffset =
2644 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2645 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2646}
2647
2648static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2649 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2650 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2651 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2652}
2653
2654SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2655                                              SelectionDAG &DAG) const {
2656 SDValue X = Op.getOperand(0);
2657 EVT VT = Op.getValueType();
2658 SDNodeFlags Flags = Op->getFlags();
2659 SDLoc DL(Op);
2660
2661 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2662 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2663
2664 const auto &Options = getTargetMachine().Options;
2665 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2666 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2667
2668 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2669 // Log and multiply in f32 is good enough for f16.
2670 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2671 }
2672
2673 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2674 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2675 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2676 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2677 }
2678
2679 return Lowered;
2680 }
2681
2682 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2683 if (ScaledInput)
2684 X = ScaledInput;
2685
2686 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2687
2688 SDValue R;
2689 if (Subtarget->hasFastFMAF32()) {
2690 // c+cc are ln(2)/ln(10) to more than 49 bits
2691 const float c_log10 = 0x1.344134p-2f;
2692 const float cc_log10 = 0x1.09f79ep-26f;
2693
2694 // c + cc is ln(2) to more than 49 bits
2695 const float c_log = 0x1.62e42ep-1f;
2696 const float cc_log = 0x1.efa39ep-25f;
2697
2698 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2699 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2700
2701 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2702 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2703 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2704 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2705 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2706 } else {
2707 // ch+ct is ln(2)/ln(10) to more than 36 bits
2708 const float ch_log10 = 0x1.344000p-2f;
2709 const float ct_log10 = 0x1.3509f6p-18f;
2710
2711 // ch + ct is ln(2) to more than 36 bits
2712 const float ch_log = 0x1.62e000p-1f;
2713 const float ct_log = 0x1.0bfbe8p-15f;
2714
2715 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2716 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2717
2718 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2719 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2720 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2721 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2722 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2723
2724 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2725 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2726 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2727 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2728 }
2729
2730 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2731 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2732
2733 // TODO: Check if known finite from source value.
2734 if (!IsFiniteOnly) {
2735 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2736 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2737 }
2738
2739 if (IsScaled) {
2740 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2741 SDValue ShiftK =
2742 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2743 SDValue Shift =
2744 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2745 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2746 }
2747
2748 return R;
2749}
2750
2751SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2752  return LowerFLOGCommon(Op, DAG);
2753}
2754
2755// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2756// promoted f16 operation.
2757SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2758                                              SelectionDAG &DAG, bool IsLog10,
2759 SDNodeFlags Flags) const {
2760 EVT VT = Src.getValueType();
2761 unsigned LogOp =
2762 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2763
2764 double Log2BaseInverted =
2765      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2766 
2767 if (VT == MVT::f32) {
2768 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2769 if (ScaledInput) {
2770 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2771 SDValue ScaledResultOffset =
2772 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2773
2774 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2775
2776 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2777 ScaledResultOffset, Zero, Flags);
2778
2779 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2780
2781 if (Subtarget->hasFastFMAF32())
2782 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2783 Flags);
2784 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2785 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2786 }
2787 }
2788
2789 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2790 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2791
2792 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2793 Flags);
2794}
2795
2796SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2797  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2798 // If we have to handle denormals, scale up the input and adjust the result.
2799
2800 SDLoc SL(Op);
2801 EVT VT = Op.getValueType();
2802 SDValue Src = Op.getOperand(0);
2803 SDNodeFlags Flags = Op->getFlags();
2804
2805 if (VT == MVT::f16) {
2806 // Nothing in half is a denormal when promoted to f32.
2807 assert(!Subtarget->has16BitInsts());
2808 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2809 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2810 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2811 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2812 }
2813
2814 assert(VT == MVT::f32);
2815
2816 if (!needsDenormHandlingF32(DAG, Src, Flags))
2817 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2818
2819 // bool needs_scaling = x < -0x1.f80000p+6f;
2820 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
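  // e.g. x = -130.0f is below the threshold, so the hardware evaluates
  // v_exp_f32(-130 + 64) = 2^-66, and the final multiply by 2^-64 yields
  // 2^-130, which is only representable as an f32 denormal (illustrative).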
2821
2822 // -nextafter(128.0, -1)
2823 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2824
2825 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2826
2827 SDValue NeedsScaling =
2828 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2829
2830 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2831 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2832
2833 SDValue AddOffset =
2834 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2835
2836 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2837 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2838
2839 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2840 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2841 SDValue ResultScale =
2842 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2843
2844 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2845}
2846
2847SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2848                                              SelectionDAG &DAG,
2849 SDNodeFlags Flags) const {
2850 EVT VT = X.getValueType();
2851 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2852
2853 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2854 // exp2(M_LOG2E_F * f);
2855 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2856 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2857 : (unsigned)ISD::FEXP2,
2858 SL, VT, Mul, Flags);
2859 }
2860
2861 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2862
2863 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2864 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2865
2866 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2867
2868 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2869
2870 SDValue AdjustedX =
2871 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2872
2873 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2874
2875 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2876
2877 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2878 SDValue AdjustedResult =
2879 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2880
2881 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2882 Flags);
2883}
2884
2885/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2886/// handled correctly.
2887SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2888                                                SelectionDAG &DAG,
2889 SDNodeFlags Flags) const {
2890 const EVT VT = X.getValueType();
2891 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2892
2893 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2894 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2895 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2896 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2897
2898 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2899 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2900 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2901 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2902 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2903 }
2904
2905 // bool s = x < -0x1.2f7030p+5f;
2906 // x += s ? 0x1.0p+5f : 0.0f;
2907 // exp10 = exp2(x * 0x1.a92000p+1f) *
2908 // exp2(x * 0x1.4f0978p-11f) *
2909 // (s ? 0x1.9f623ep-107f : 1.0f);
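  //
  // Note: 0x1.a92000p+1f + 0x1.4f0978p-11f is a two-term split of
  // log2(10) ~= 3.321928, so the product of the two exp2 calls is
  // 2^(x * log2(10)) = 10^x while keeping extra precision in the low term.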
2910
2911 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2912
2913 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2914 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2915
2916 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2917 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2918 SDValue AdjustedX =
2919 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2920
2921 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2922 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2923
2924 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2925 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2926 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2927 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2928
2929 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2930
2931 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2932 SDValue AdjustedResult =
2933 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2934
2935 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2936 Flags);
2937}
2938
2939SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2940  EVT VT = Op.getValueType();
2941 SDLoc SL(Op);
2942 SDValue X = Op.getOperand(0);
2943 SDNodeFlags Flags = Op->getFlags();
2944 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2945
2946 if (VT.getScalarType() == MVT::f16) {
2947 // v_exp_f16 (fmul x, log2e)
2948 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2949 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2950
2951 if (VT.isVector())
2952 return SDValue();
2953
2954 // exp(f16 x) ->
2955 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2956
2957 // Nothing in half is a denormal when promoted to f32.
2958 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2959 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2960 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2961 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2962 }
2963
2964 assert(VT == MVT::f32);
2965
2966 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2967 // library behavior. Also, is known-not-daz source sufficient?
2968 if (allowApproxFunc(DAG, Flags)) {
2969 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2970 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2971 }
2972
2973 // Algorithm:
2974 //
2975 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2976 //
2977 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2978 // n = 64*m + j, 0 <= j < 64
2979 //
2980 // e^x = 2^((64*m + j + f)/64)
2981 // = (2^m) * (2^(j/64)) * 2^(f/64)
2982 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2983 //
2984 // f = x*(64/ln(2)) - n
2985 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
2986 //
2987 // e^x = (2^m) * (2^(j/64)) * e^r
2988 //
2989 // (2^(j/64)) is precomputed
2990 //
2991 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2992 // e^r = 1 + q
2993 //
2994 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2995 //
2996 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
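  //
  // Informal mapping onto the code below: PH + PL is x*log2(e) (x*log2(10)
  // for exp10) split into high and low parts, E = roundeven(PH) plays the
  // role of n, v_exp_f32(PH - E + PL) produces the fractional 2^f factor, and
  // FLDEXP applies the remaining 2^E scale.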
2997 SDNodeFlags FlagsNoContract = Flags;
2998 FlagsNoContract.setAllowContract(false);
2999
3000 SDValue PH, PL;
3001 if (Subtarget->hasFastFMAF32()) {
3002 const float c_exp = numbers::log2ef;
3003 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3004 const float c_exp10 = 0x1.a934f0p+1f;
3005 const float cc_exp10 = 0x1.2f346ep-24f;
3006
3007 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3008 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3009
3010 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3011 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3012 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3013 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3014 } else {
3015 const float ch_exp = 0x1.714000p+0f;
3016 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3017
3018 const float ch_exp10 = 0x1.a92000p+1f;
3019 const float cl_exp10 = 0x1.4f0978p-11f;
3020
3021 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3022 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3023
3024 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3025 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3026 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3027 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3028 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3029
3030 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3031
3032 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3033 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3034 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3035 }
3036
3037 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3038
3039 // It is unsafe to contract this fsub into the PH multiply.
3040 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3041
3042 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3043 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3044 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3045
3046 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3047
3048 SDValue UnderflowCheckConst =
3049 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3050
3051 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3052 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3053 SDValue Underflow =
3054 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3055
3056 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3057 const auto &Options = getTargetMachine().Options;
3058
3059 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3060 SDValue OverflowCheckConst =
3061 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3062 SDValue Overflow =
3063 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3064 SDValue Inf =
3066 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3067 }
3068
3069 return R;
3070}
3071
3072static bool isCtlzOpc(unsigned Opc) {
3073 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3074}
3075
3076static bool isCttzOpc(unsigned Opc) {
3077 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3078}
3079
3080SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3081                                               SelectionDAG &DAG) const {
3082 auto SL = SDLoc(Op);
3083 auto Arg = Op.getOperand(0u);
3084 auto ResultVT = Op.getValueType();
3085
3086 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3087 return {};
3088
3089 assert(isCtlzOpc(Op.getOpcode()));
3090 assert(ResultVT == Arg.getValueType());
3091
3092 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3093 auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3094 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3095 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3096 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
3097 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3098}
3099
3100SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3101  SDLoc SL(Op);
3102 SDValue Src = Op.getOperand(0);
3103
3104 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3105 bool Ctlz = isCtlzOpc(Op.getOpcode());
3106 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3107
3108 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3109 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3110 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3111
3112 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3113 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3114 // (cttz hi:lo) -> (umin (ffbl src), 32)
3115 // (ctlz_zero_undef src) -> (ffbh src)
3116 // (cttz_zero_undef src) -> (ffbl src)
3117
3118    // The 64-bit scalar version produces a 32-bit result.
3119 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3120 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3121 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3122 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3123 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3124 if (!ZeroUndef) {
3125 const SDValue ConstVal = DAG.getConstant(
3126 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3127 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3128 }
3129 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3130 }
3131
3132 SDValue Lo, Hi;
3133 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3134
3135 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3136 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3137
3138 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3139 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3140 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3141 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
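  //
  // Illustrative example: Src = 0x0000000100000000 (ctlz == 31). ffbh(Hi) = 31,
  // and since Lo == 0 its ffbh result is all ones (assuming the usual
  // zero-input behavior the uaddsat clamp accounts for), so umin picks 31 and
  // the final umin with 64 leaves it unchanged.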
3142
3143 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3144 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3145 if (Ctlz)
3146 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3147 else
3148 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3149
3150 SDValue NewOpr;
3151 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3152 if (!ZeroUndef) {
3153 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3154 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3155 }
3156
3157 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3158}
3159
3160SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3161                                               bool Signed) const {
3162  // The regular method of converting a 64-bit integer to a float roughly consists
3163  // of two steps: normalization and rounding. In fact, after normalization, the
3164 // conversion from a 64-bit integer to a float is essentially the same as the
3165 // one from a 32-bit integer. The only difference is that it has more
3166 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3167  // 64-bit integer can be preprocessed to fit into a 32-bit integer and then
3168  // converted into the correct float number. The basic steps for the unsigned
3169 // conversion are illustrated in the following pseudo code:
3170 //
3171 // f32 uitofp(i64 u) {
3172 // i32 hi, lo = split(u);
3173 // // Only count the leading zeros in hi as we have native support of the
3174 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3175 // // reduced to a 32-bit one automatically.
3176 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3177 // u <<= shamt;
3178 // hi, lo = split(u);
3179 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3180 // // convert it as a 32-bit integer and scale the result back.
3181 // return uitofp(hi) * 2^(32 - shamt);
3182 // }
3183 //
3184 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3185 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3186  // converted instead, followed by a negation based on its sign bit.
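  //
  // Illustrative example: u = 2^40 + 1. hi = 0x100, so shamt = clz(hi) = 23;
  // u << 23 has hi = 0x80000000 and lo = 0x00800000 != 0, so hi |= 1. The
  // 32-bit conversion rounds 0x80000001 to 2^31, and scaling by
  // 2^(32 - 23) = 2^9 gives 2^40, the correctly rounded f32 result.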
3187
3188 SDLoc SL(Op);
3189 SDValue Src = Op.getOperand(0);
3190
3191 SDValue Lo, Hi;
3192 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3193 SDValue Sign;
3194 SDValue ShAmt;
3195 if (Signed && Subtarget->isGCN()) {
3196 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3197 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3198 // account. That is, the maximal shift is
3199 // - 32 if Lo and Hi have opposite signs;
3200 // - 33 if Lo and Hi have the same sign.
3201 //
3202 // Or, MaxShAmt = 33 + OppositeSign, where
3203 //
3204 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3205 // - -1 if Lo and Hi have opposite signs; and
3206 // - 0 otherwise.
3207 //
3208 // All in all, ShAmt is calculated as
3209 //
3210 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3211 //
3212 // or
3213 //
3214 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3215 //
3216 // to reduce the critical path.
3217 SDValue OppositeSign = DAG.getNode(
3218 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3219 DAG.getConstant(31, SL, MVT::i32));
3220 SDValue MaxShAmt =
3221 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3222 OppositeSign);
3223 // Count the leading sign bits.
3224 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3225 // Different from unsigned conversion, the shift should be one bit less to
3226 // preserve the sign bit.
3227 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3228 DAG.getConstant(1, SL, MVT::i32));
3229 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3230 } else {
3231 if (Signed) {
3232      // Without 'ffbh_i32', only leading zeros can be counted. Take the
3233 // absolute value first.
3234 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3235 DAG.getConstant(63, SL, MVT::i64));
3236 SDValue Abs =
3237 DAG.getNode(ISD::XOR, SL, MVT::i64,
3238 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3239 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3240 }
3241 // Count the leading zeros.
3242 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3243 // The shift amount for signed integers is [0, 32].
3244 }
3245 // Normalize the given 64-bit integer.
3246 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3247 // Split it again.
3248 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3249 // Calculate the adjust bit for rounding.
3250 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3251 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3252 DAG.getConstant(1, SL, MVT::i32), Lo);
3253 // Get the 32-bit normalized integer.
3254 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3255 // Convert the normalized 32-bit integer into f32.
3256 unsigned Opc =
3257 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3258 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3259
3260 // Finally, need to scale back the converted floating number as the original
3261 // 64-bit integer is converted as a 32-bit one.
3262 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3263 ShAmt);
3264 // On GCN, use LDEXP directly.
3265 if (Subtarget->isGCN())
3266 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3267
3268 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3269 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3270 // exponent is enough to avoid overflowing into the sign bit.
3271 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3272 DAG.getConstant(23, SL, MVT::i32));
3273 SDValue IVal =
3274 DAG.getNode(ISD::ADD, SL, MVT::i32,
3275 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3276 if (Signed) {
3277 // Set the sign bit.
3278 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3279 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3280 DAG.getConstant(31, SL, MVT::i32));
3281 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3282 }
3283 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3284}
3285
3286SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3287                                               bool Signed) const {
3288 SDLoc SL(Op);
3289 SDValue Src = Op.getOperand(0);
3290
3291 SDValue Lo, Hi;
3292 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3293
3294  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3295                              SL, MVT::f64, Hi);
3296
3297 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3298
3299 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3300 DAG.getConstant(32, SL, MVT::i32));
3301 // TODO: Should this propagate fast-math-flags?
3302 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3303}
3304
3305SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3306                                              SelectionDAG &DAG) const {
3307 // TODO: Factor out code common with LowerSINT_TO_FP.
3308 EVT DestVT = Op.getValueType();
3309 SDValue Src = Op.getOperand(0);
3310 EVT SrcVT = Src.getValueType();
3311
3312 if (SrcVT == MVT::i16) {
3313 if (DestVT == MVT::f16)
3314 return Op;
3315 SDLoc DL(Op);
3316
3317 // Promote src to i32
3318 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3319 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3320 }
3321
3322 if (DestVT == MVT::bf16) {
3323 SDLoc SL(Op);
3324 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3325 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3326 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3327 }
3328
3329 if (SrcVT != MVT::i64)
3330 return Op;
3331
3332 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3333 SDLoc DL(Op);
3334
3335 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3336 SDValue FPRoundFlag =
3337 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3338 SDValue FPRound =
3339 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3340
3341 return FPRound;
3342 }
3343
3344 if (DestVT == MVT::f32)
3345 return LowerINT_TO_FP32(Op, DAG, false);
3346
3347 assert(DestVT == MVT::f64);
3348 return LowerINT_TO_FP64(Op, DAG, false);
3349}
3350
3351SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3352                                              SelectionDAG &DAG) const {
3353 EVT DestVT = Op.getValueType();
3354
3355 SDValue Src = Op.getOperand(0);
3356 EVT SrcVT = Src.getValueType();
3357
3358 if (SrcVT == MVT::i16) {
3359 if (DestVT == MVT::f16)
3360 return Op;
3361
3362 SDLoc DL(Op);
3363 // Promote src to i32
3364 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3365 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3366 }
3367
3368 if (DestVT == MVT::bf16) {
3369 SDLoc SL(Op);
3370 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3371 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3372 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3373 }
3374
3375 if (SrcVT != MVT::i64)
3376 return Op;
3377
3378 // TODO: Factor out code common with LowerUINT_TO_FP.
3379
3380 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3381 SDLoc DL(Op);
3382 SDValue Src = Op.getOperand(0);
3383
3384 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3385 SDValue FPRoundFlag =
3386 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3387 SDValue FPRound =
3388 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3389
3390 return FPRound;
3391 }
3392
3393 if (DestVT == MVT::f32)
3394 return LowerINT_TO_FP32(Op, DAG, true);
3395
3396 assert(DestVT == MVT::f64);
3397 return LowerINT_TO_FP64(Op, DAG, true);
3398}
3399
3400SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3401                                               bool Signed) const {
3402 SDLoc SL(Op);
3403
3404 SDValue Src = Op.getOperand(0);
3405 EVT SrcVT = Src.getValueType();
3406
3407 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3408
3409 // The basic idea of converting a floating point number into a pair of 32-bit
3410 // integers is illustrated as follows:
3411 //
3412 // tf := trunc(val);
3413 // hif := floor(tf * 2^-32);
3414 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3415 // hi := fptoi(hif);
3416 // lo := fptoi(lof);
3417 //
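  // Illustrative example: val = 2^33 + 7 (exactly representable): tf = val,
  // hif = floor(val * 2^-32) = 2.0, lof = fma(2.0, -2^32, val) = 7.0, so
  // hi = 2, lo = 7 and the packed result is 0x0000000200000007.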
3418 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3419 SDValue Sign;
3420 if (Signed && SrcVT == MVT::f32) {
3421    // However, a 32-bit floating point number has only a 23-bit mantissa, and
3422    // it's not enough to hold all the significant bits of `lof` if val is
3423    // negative. To avoid the loss of precision, we need to take the absolute
3424 // value after truncating and flip the result back based on the original
3425 // signedness.
3426 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3427 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3428 DAG.getConstant(31, SL, MVT::i32));
3429 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3430 }
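// Note: the xor/sub sequence applied below relies on the identity
//   (r ^ sign) - sign == r   when sign == 0, and
//   (r ^ sign) - sign == -r  when sign == -1 (all ones),
// i.e. a branchless conditional two's-complement negation.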
3431
3432 SDValue K0, K1;
3433 if (SrcVT == MVT::f64) {
3434 K0 = DAG.getConstantFP(
3435 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3436 SrcVT);
3437 K1 = DAG.getConstantFP(
3438 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3439 SrcVT);
3440 } else {
3441 K0 = DAG.getConstantFP(
3442 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3443 K1 = DAG.getConstantFP(
3444 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3445 }
3446 // TODO: Should this propagate fast-math-flags?
3447 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3448
3449 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3450
3451 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3452
3453 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3454 : ISD::FP_TO_UINT,
3455 SL, MVT::i32, FloorMul);
3456 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3457
3458 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3459 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3460
3461 if (Signed && SrcVT == MVT::f32) {
3462 assert(Sign);
3463 // Flip the result based on the sign mask, which is either all 0s or all 1s.
3464 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3465 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3466 // r := xor(r, sign) - sign;
3467 Result =
3468 DAG.getNode(ISD::SUB, SL, MVT::i64,
3469 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3470 }
3471
3472 return Result;
3473}
3474
3475SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3476 SDLoc DL(Op);
3477 SDValue N0 = Op.getOperand(0);
3478
3479 // Convert to target node to get known bits
3480 if (N0.getValueType() == MVT::f32)
3481 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3482
3483 if (getTargetMachine().Options.UnsafeFPMath) {
3484 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3485 return SDValue();
3486 }
3487
3488 assert(N0.getSimpleValueType() == MVT::f64);
3489
3490 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
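// For reference: f64 is 1 sign bit, 11 exponent bits (bias 1023) and a
// 52-bit mantissa; f16 is 1 sign bit, 5 exponent bits (bias 15) and a
// 10-bit mantissa. UH below holds the upper 32 bits of the f64 value
// (sign, exponent, and the top 20 mantissa bits).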
3491 const unsigned ExpMask = 0x7ff;
3492 const unsigned ExpBiasf64 = 1023;
3493 const unsigned ExpBiasf16 = 15;
3494 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3495 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3496 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3497 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3498 DAG.getConstant(32, DL, MVT::i64));
3499 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3500 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3501 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3502 DAG.getConstant(20, DL, MVT::i64));
3503 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3504 DAG.getConstant(ExpMask, DL, MVT::i32));
3505 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3506 // add the f16 bias (15) to get the biased exponent for the f16 format.
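// For example, 1.0 has a biased f64 exponent of 1023, which becomes
// 1023 - 1023 + 15 = 15, the biased f16 exponent of 1.0.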
3507 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3508 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3509
3510 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3511 DAG.getConstant(8, DL, MVT::i32));
3512 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3513 DAG.getConstant(0xffe, DL, MVT::i32));
3514
3515 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3516 DAG.getConstant(0x1ff, DL, MVT::i32));
3517 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3518
3519 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3520 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3521
3522 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3523 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3524 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3525 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3526
3527 // N = M | (E << 12);
3528 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3529 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3530 DAG.getConstant(12, DL, MVT::i32)));
3531
3532 // B = clamp(1-E, 0, 13);
3533 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3534 One, E);
3535 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3536 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3537 DAG.getConstant(13, DL, MVT::i32));
3538
3539 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3540 DAG.getConstant(0x1000, DL, MVT::i32));
3541
3542 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3543 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3544 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3545 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3546
3547 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3548 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3549 DAG.getConstant(0x7, DL, MVT::i32));
3550 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3551 DAG.getConstant(2, DL, MVT::i32));
3552 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3553 One, Zero, ISD::SETEQ);
3554 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3555 One, Zero, ISD::SETGT);
3556 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3557 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3558
3559 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3560 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3561 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3562 I, V, ISD::SETEQ);
3563
3564 // Extract the sign bit.
3565 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3566 DAG.getConstant(16, DL, MVT::i32));
3567 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3568 DAG.getConstant(0x8000, DL, MVT::i32));
3569
3570 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3571 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3572}
3573
3574SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3575 SelectionDAG &DAG) const {
3576 SDValue Src = Op.getOperand(0);
3577 unsigned OpOpcode = Op.getOpcode();
3578 EVT SrcVT = Src.getValueType();
3579 EVT DestVT = Op.getValueType();
3580
3581 // Will be selected natively
3582 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3583 return Op;
3584
3585 if (SrcVT == MVT::bf16) {
3586 SDLoc DL(Op);
3587 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3588 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3589 }
3590
3591 // Promote i16 to i32
3592 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3593 SDLoc DL(Op);
3594
3595 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3596 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3597 }
3598
3599 if (DestVT != MVT::i64)
3600 return Op;
3601
3602 if (SrcVT == MVT::f16 ||
3603 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3604 SDLoc DL(Op);
3605
3606 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3607 unsigned Ext =
3608 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3609 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3610 }
3611
3612 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3613 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3614
3615 return SDValue();
3616}
3617
3618SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3619 SelectionDAG &DAG) const {
3620 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3621 MVT VT = Op.getSimpleValueType();
3622 MVT ScalarVT = VT.getScalarType();
3623
3624 assert(VT.isVector());
3625
3626 SDValue Src = Op.getOperand(0);
3627 SDLoc DL(Op);
3628
3629 // TODO: Don't scalarize on Evergreen?
3630 unsigned NElts = VT.getVectorNumElements();
3631 SmallVector<SDValue, 8> Args;
3632 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3633
3634 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3635 for (unsigned I = 0; I < NElts; ++I)
3636 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3637
3638 return DAG.getBuildVector(VT, DL, Args);
3639}
3640
3641//===----------------------------------------------------------------------===//
3642// Custom DAG optimizations
3643//===----------------------------------------------------------------------===//
3644
3645static bool isU24(SDValue Op, SelectionDAG &DAG) {
3646 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3647}
3648
3649static bool isI24(SDValue Op, SelectionDAG &DAG) {
3650 EVT VT = Op.getValueType();
3651 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3652 // as unsigned 24-bit values.
3653 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3654}
3655
3656SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3657 TargetLowering::DAGCombinerInfo &DCI) const {
3658 SelectionDAG &DAG = DCI.DAG;
3659 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3660 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3661
3662 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3663 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3664 unsigned NewOpcode = Node24->getOpcode();
3665 if (IsIntrin) {
3666 unsigned IID = Node24->getConstantOperandVal(0);
3667 switch (IID) {
3668 case Intrinsic::amdgcn_mul_i24:
3669 NewOpcode = AMDGPUISD::MUL_I24;
3670 break;
3671 case Intrinsic::amdgcn_mul_u24:
3672 NewOpcode = AMDGPUISD::MUL_U24;
3673 break;
3674 case Intrinsic::amdgcn_mulhi_i24:
3675 NewOpcode = AMDGPUISD::MULHI_I24;
3676 break;
3677 case Intrinsic::amdgcn_mulhi_u24:
3678 NewOpcode = AMDGPUISD::MULHI_U24;
3679 break;
3680 default:
3681 llvm_unreachable("Expected 24-bit mul intrinsic");
3682 }
3683 }
3684
3685 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3686
3687 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3688 // the operands to have other uses, but will only perform simplifications that
3689 // involve bypassing some nodes for this user.
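// For example, an explicit (and x, 0xffffff) feeding one of the multiply
// operands can be bypassed here, since only the low 24 bits are demanded:
//   mul_u24 (and x, 0xffffff), y --> mul_u24 x, y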
3690 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3691 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3692 if (DemandedLHS || DemandedRHS)
3693 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3694 DemandedLHS ? DemandedLHS : LHS,
3695 DemandedRHS ? DemandedRHS : RHS);
3696
3697 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3698 // operands if this node is the only user.
3699 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3700 return SDValue(Node24, 0);
3701 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3702 return SDValue(Node24, 0);
3703
3704 return SDValue();
3705}
3706
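// Worked examples (illustrative) of the constant fold below:
//   unsigned: Src0 = 0x12345678, Offset = 8, Width = 8 -> 0x56
//   signed:   Src0 = 0x0000ff00, Offset = 8, Width = 8 -> -1 (sign-extended)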
3707template <typename IntTy>
3708static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3709 uint32_t Width, const SDLoc &DL) {
3710 if (Width + Offset < 32) {
3711 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3712 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3713 return DAG.getConstant(Result, DL, MVT::i32);
3714 }
3715
3716 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3717}
3718
3719static bool hasVolatileUser(SDNode *Val) {
3720 for (SDNode *U : Val->uses()) {
3721 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3722 if (M->isVolatile())
3723 return true;
3724 }
3725 }
3726
3727 return false;
3728}
3729
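// As a rough guide to the check below: a v4i8 memory type is combined
// (accessed as i32), while scalar i8/i16/i32 accesses and odd sizes such as
// 3 bytes are left alone.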
3730bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3731 // i32 vectors are the canonical memory type.
3732 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3733 return false;
3734
3735 if (!VT.isByteSized())
3736 return false;
3737
3738 unsigned Size = VT.getStoreSize();
3739
3740 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3741 return false;
3742
3743 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3744 return false;
3745
3746 return true;
3747}
3748
3749// Replace a load of an illegal type with a load of an equivalent, friendlier
3750// type and bitcast the result back to the original type.
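// For example, a v4i8 load is rewritten as an i32 load whose result is
// bitcast back to v4i8, assuming the alignment checks below pass.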
3751SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3752 DAGCombinerInfo &DCI) const {
3753 if (!DCI.isBeforeLegalize())
3754 return SDValue();
3755
3756 LoadSDNode *LN = cast<LoadSDNode>(N);
3757 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3758 return SDValue();
3759
3760 SDLoc SL(N);
3761 SelectionDAG &DAG = DCI.DAG;
3762 EVT VT = LN->getMemoryVT();
3763
3764 unsigned Size = VT.getStoreSize();
3765 Align Alignment = LN->getAlign();
3766 if (Alignment < Size && isTypeLegal(VT)) {
3767 unsigned IsFast;
3768 unsigned AS = LN->getAddressSpace();
3769
3770 // Expand unaligned loads earlier than legalization. Due to visitation order
3771 // problems during legalization, the emitted instructions to pack and unpack
3772 // the bytes again are not eliminated in the case of an unaligned copy.
3773 if (!allowsMisalignedMemoryAccesses(
3774 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3775 if (VT.isVector())
3776 return SplitVectorLoad(SDValue(LN, 0), DAG);
3777
3778 SDValue Ops[2];
3779 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3780
3781 return DAG.getMergeValues(Ops, SDLoc(N));
3782 }
3783
3784 if (!IsFast)
3785 return SDValue();
3786 }
3787
3788 if (!shouldCombineMemoryType(VT))
3789 return SDValue();
3790
3791 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3792
3793 SDValue NewLoad
3794 = DAG.getLoad(NewVT, SL, LN->getChain(),
3795 LN->getBasePtr(), LN->getMemOperand());
3796
3797 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3798 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3799 return SDValue(N, 0);
3800}
3801
3802// Replace a store of an illegal type with a store of the value bitcast to a
3803// friendlier type.
3804SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3805 DAGCombinerInfo &DCI) const {
3806 if (!DCI.isBeforeLegalize())
3807 return SDValue();
3808
3809 StoreSDNode *SN = cast<StoreSDNode>(N);
3810 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3811 return SDValue();
3812
3813 EVT VT = SN->getMemoryVT();
3814 unsigned Size = VT.getStoreSize();
3815
3816 SDLoc SL(N);
3817 SelectionDAG &DAG = DCI.DAG;
3818 Align Alignment = SN->getAlign();
3819 if (Alignment < Size && isTypeLegal(VT)) {
3820 unsigned IsFast;
3821 unsigned AS = SN->getAddressSpace();
3822
3823 // Expand unaligned stores earlier than legalization. Due to visitation
3824 // order problems during legalization, the emitted instructions to pack and
3825 // unpack the bytes again are not eliminated in the case of an unaligned
3826 // copy.
3827 if (!allowsMisalignedMemoryAccesses(
3828 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3829 if (VT.isVector())
3830 return SplitVectorStore(SDValue(SN, 0), DAG);
3831
3832 return expandUnalignedStore(SN, DAG);
3833 }
3834
3835 if (!IsFast)
3836 return SDValue();
3837 }
3838
3839 if (!shouldCombineMemoryType(VT))
3840 return SDValue();
3841
3842 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3843 SDValue Val = SN->getValue();
3844
3845 //DCI.AddToWorklist(Val.getNode());
3846
3847 bool OtherUses = !Val.hasOneUse();
3848 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3849 if (OtherUses) {
3850 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3851 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3852 }
3853
3854 return DAG.getStore(SN->getChain(), SL, CastVal,
3855 SN->getBasePtr(), SN->getMemOperand());
3856}
3857
3858// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3859// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3860// issues.
3861SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3862 DAGCombinerInfo &DCI) const {
3863 SelectionDAG &DAG = DCI.DAG;
3864 SDValue N0 = N->getOperand(0);
3865
3866 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3867 // (vt2 (truncate (assertzext vt0:x, vt1)))
3868 if (N0.getOpcode() == ISD::TRUNCATE) {
3869 SDValue N1 = N->getOperand(1);
3870 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3871 SDLoc SL(N);
3872
3873 SDValue Src = N0.getOperand(0);
3874 EVT SrcVT = Src.getValueType();
3875 if (SrcVT.bitsGE(ExtVT)) {
3876 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3877 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3878 }
3879 }
3880
3881 return SDValue();
3882}
3883
3884SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3885 SDNode *N, DAGCombinerInfo &DCI) const {
3886 unsigned IID = N->getConstantOperandVal(0);
3887 switch (IID) {
3888 case Intrinsic::amdgcn_mul_i24:
3889 case Intrinsic::amdgcn_mul_u24:
3890 case Intrinsic::amdgcn_mulhi_i24:
3891 case Intrinsic::amdgcn_mulhi_u24:
3892 return simplifyMul24(N, DCI);
3893 case Intrinsic::amdgcn_fract:
3894 case Intrinsic::amdgcn_rsq:
3895 case Intrinsic::amdgcn_rcp_legacy:
3896 case Intrinsic::amdgcn_rsq_legacy:
3897 case Intrinsic::amdgcn_rsq_clamp: {
3898 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3899 SDValue Src = N->getOperand(1);
3900 return Src.isUndef() ? Src : SDValue();
3901 }
3902 case Intrinsic::amdgcn_frexp_exp: {
3903 // frexp_exp (fneg x) -> frexp_exp x
3904 // frexp_exp (fabs x) -> frexp_exp x
3905 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3906 SDValue Src = N->getOperand(1);
3907 SDValue PeekSign = peekFPSignOps(Src);
3908 if (PeekSign == Src)
3909 return SDValue();
3910 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3911 0);
3912 }
3913 default:
3914 return SDValue();
3915 }
3916}
3917
3918/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3919/// binary operation \p Opc to it with the corresponding constant operands.
3920SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3921 DAGCombinerInfo &DCI, const SDLoc &SL,
3922 unsigned Opc, SDValue LHS,
3923 uint32_t ValLo, uint32_t ValHi) const {
3924 SelectionDAG &DAG = DCI.DAG;
3925 SDValue Lo, Hi;
3926 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3927
3928 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3929 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3930
3931 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3932 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3933
3934 // Re-visit the ands. It's possible we eliminated one of them and it could
3935 // simplify the vector.
3936 DCI.AddToWorklist(Lo.getNode());
3937 DCI.AddToWorklist(Hi.getNode());
3938
3939 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3940 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3941}
3942
3943SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3944 DAGCombinerInfo &DCI) const {
3945 EVT VT = N->getValueType(0);
3946
3947 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3948 if (!RHS)
3949 return SDValue();
3950
3951 SDValue LHS = N->getOperand(0);
3952 unsigned RHSVal = RHS->getZExtValue();
3953 if (!RHSVal)
3954 return LHS;
3955
3956 SDLoc SL(N);
3957 SelectionDAG &DAG = DCI.DAG;
3958
3959 switch (LHS->getOpcode()) {
3960 default:
3961 break;
3962 case ISD::ZERO_EXTEND:
3963 case ISD::SIGN_EXTEND:
3964 case ISD::ANY_EXTEND: {
3965 SDValue X = LHS->getOperand(0);
3966
3967 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3968 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3969 // Prefer build_vector as the canonical form if packed types are legal.
3970 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3971 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3972 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3973 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3974 }
3975
3976 // shl (ext x) => zext (shl x), if shift does not overflow int
3977 if (VT != MVT::i64)
3978 break;
3979 KnownBits Known = DAG.computeKnownBits(X);
3980 unsigned LZ = Known.countMinLeadingZeros();
3981 if (LZ < RHSVal)
3982 break;
3983 EVT XVT = X.getValueType();
3984 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3985 return DAG.getZExtOrTrunc(Shl, SL, VT);
3986 }
3987 }
3988
3989 if (VT != MVT::i64)
3990 return SDValue();
3991
3992 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3993
3994 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3995 // common case, splitting this into a move and a 32-bit shift is faster and
3996 // the same code size.
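// For example, (shl i64:x, 40) becomes a v2i32 build_vector whose low word
// is 0 and whose high word is (shl (i32 (trunc x)), 8), bitcast back to i64.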
3997 if (RHSVal < 32)
3998 return SDValue();
3999
4000 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4001
4002 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4003 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4004
4005 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4006
4007 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4008 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4009}
4010
4011SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4012 DAGCombinerInfo &DCI) const {
4013 if (N->getValueType(0) != MVT::i64)
4014 return SDValue();
4015
4016 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4017 if (!RHS)
4018 return SDValue();
4019
4020 SelectionDAG &DAG = DCI.DAG;
4021 SDLoc SL(N);
4022 unsigned RHSVal = RHS->getZExtValue();
4023
4024 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4025 if (RHSVal == 32) {
4026 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4027 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4028 DAG.getConstant(31, SL, MVT::i32));
4029
4030 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4031 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4032 }
4033
4034 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4035 if (RHSVal == 63) {
4036 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4037 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4038 DAG.getConstant(31, SL, MVT::i32));
4039 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4040 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4041 }
4042
4043 return SDValue();
4044}
4045
4046SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4047 DAGCombinerInfo &DCI) const {
4048 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4049 if (!RHS)
4050 return SDValue();
4051
4052 EVT VT = N->getValueType(0);
4053 SDValue LHS = N->getOperand(0);
4054 unsigned ShiftAmt = RHS->getZExtValue();
4055 SelectionDAG &DAG = DCI.DAG;
4056 SDLoc SL(N);
4057
4058 // Fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1).
4059 // This improves the ability to match BFE patterns in isel.
4060 if (LHS.getOpcode() == ISD::AND) {
4061 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4062 unsigned MaskIdx, MaskLen;
4063 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4064 MaskIdx == ShiftAmt) {
4065 return DAG.getNode(
4066 ISD::AND, SL, VT,
4067 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4068 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4069 }
4070 }
4071 }
4072
4073 if (VT != MVT::i64)
4074 return SDValue();
4075
4076 if (ShiftAmt < 32)
4077 return SDValue();
4078
4079 // srl i64:x, C for C >= 32
4080 // =>
4081 // build_pair (srl hi_32(x), C - 32), 0
4082 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4083
4084 SDValue Hi = getHiHalf64(LHS, DAG);
4085
4086 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4087 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4088
4089 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4090
4091 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4092}
4093
4094SDValue AMDGPUTargetLowering::performTruncateCombine(
4095 SDNode *N, DAGCombinerInfo &DCI) const {
4096 SDLoc SL(N);
4097 SelectionDAG &DAG = DCI.DAG;
4098 EVT VT = N->getValueType(0);
4099 SDValue Src = N->getOperand(0);
4100
4101 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4102 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4103 SDValue Vec = Src.getOperand(0);
4104 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4105 SDValue Elt0 = Vec.getOperand(0);
4106 EVT EltVT = Elt0.getValueType();
4107 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4108 if (EltVT.isFloatingPoint()) {
4109 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4110 EltVT.changeTypeToInteger(), Elt0);
4111 }
4112
4113 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4114 }
4115 }
4116 }
4117
4118 // Equivalent of above for accessing the high element of a vector as an
4119 // integer operation.
4120 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4121 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4122 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4123 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4124 SDValue BV = stripBitcast(Src.getOperand(0));
4125 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4126 BV.getValueType().getVectorNumElements() == 2) {
4127 SDValue SrcElt = BV.getOperand(1);
4128 EVT SrcEltVT = SrcElt.getValueType();
4129 if (SrcEltVT.isFloatingPoint()) {
4130 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4131 SrcEltVT.changeTypeToInteger(), SrcElt);
4132 }
4133
4134 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4135 }
4136 }
4137 }
4138 }
4139
4140 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4141 //
4142 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4143 // i16 (trunc (srl (i32 (trunc x), K)))
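// For example, i16 (trunc (srl i64:x, 8)) only needs bits [23:8] of x, so it
// can be rewritten as i16 (trunc (srl (i32 (trunc x)), 8)).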
4144 if (VT.getScalarSizeInBits() < 32) {
4145 EVT SrcVT = Src.getValueType();
4146 if (SrcVT.getScalarSizeInBits() > 32 &&
4147 (Src.getOpcode() == ISD::SRL ||
4148 Src.getOpcode() == ISD::SRA ||
4149 Src.getOpcode() == ISD::SHL)) {
4150 SDValue Amt = Src.getOperand(1);
4151 KnownBits Known = DAG.computeKnownBits(Amt);
4152
4153 // - For left shifts, do the transform as long as the shift
4154 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4155 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4156 // losing information stored in the high bits when truncating.
4157 const unsigned MaxCstSize =
4158 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4159 if (Known.getMaxValue().ule(MaxCstSize)) {
4160 EVT MidVT = VT.isVector() ?
4161 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4162 VT.getVectorNumElements()) : MVT::i32;
4163
4164 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4165 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4166 Src.getOperand(0));
4167 DCI.AddToWorklist(Trunc.getNode());
4168
4169 if (Amt.getValueType() != NewShiftVT) {
4170 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4171 DCI.AddToWorklist(Amt.getNode());
4172 }
4173
4174 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4175 Trunc, Amt);
4176 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4177 }
4178 }
4179 }
4180
4181 return SDValue();
4182}
4183
4184// We need to specifically handle i64 mul here to avoid unnecessary conversion
4185// instructions. If we only match on the legalized i64 mul expansion,
4186// SimplifyDemandedBits will be unable to remove them because there will be
4187// multiple uses due to the separate mul + mulh[su].
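// Roughly: when both operands fit in 24 bits, the (at most 48-bit) product
// is assembled from two 32-bit pieces below, with MUL_*24 providing bits
// [31:0] and MULHI_*24 providing bits [63:32], joined by a BUILD_PAIR.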
4188static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4189 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4190 if (Size <= 32) {
4191 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4192 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4193 }
4194
4195 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4196 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4197
4198 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4199 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4200
4201 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4202}
4203
4204/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4205/// return SDValue().
4206static SDValue getAddOneOp(const SDNode *V) {
4207 if (V->getOpcode() != ISD::ADD)
4208 return SDValue();
4209
4210 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4211}
4212
4213SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4214 DAGCombinerInfo &DCI) const {
4215 assert(N->getOpcode() == ISD::MUL);
4216 EVT VT = N->getValueType(0);
4217
4218 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4219 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4220 // unnecessarily). isDivergent() is used as an approximation of whether the
4221 // value is in an SGPR.
4222 if (!N->isDivergent())
4223 return SDValue();
4224
4225 unsigned Size = VT.getSizeInBits();
4226 if (VT.isVector() || Size > 64)
4227 return SDValue();
4228
4229 SelectionDAG &DAG = DCI.DAG;
4230 SDLoc DL(N);
4231
4232 SDValue N0 = N->getOperand(0);
4233 SDValue N1 = N->getOperand(1);
4234
4235 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4236 // matching.
4237
4238 // mul x, (add y, 1) -> add (mul x, y), x
4239 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4240 SDValue AddOp = getAddOneOp(V.getNode());
4241 if (!AddOp)
4242 return SDValue();
4243
4244 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4245 return U->getOpcode() == ISD::MUL;
4246 }))
4247 return AddOp;
4248
4249 return SDValue();
4250 };
4251
4252 // FIXME: The selection pattern is not properly checking for commuted
4253 // operands, so we have to place the mul in the LHS
4254 if (SDValue MulOper = IsFoldableAdd(N0)) {
4255 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4256 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4257 }
4258
4259 if (SDValue MulOper = IsFoldableAdd(N1)) {
4260 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4261 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4262 }
4263
4264 // There are i16 integer mul/mad.
4265 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4266 return SDValue();
4267
4268 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4269 // in the source into any_extends if the result of the mul is truncated. Since
4270 // we can assume the high bits are whatever we want, use the underlying value
4271 // to keep the unknown high bits from interfering.
4272 if (N0.getOpcode() == ISD::ANY_EXTEND)
4273 N0 = N0.getOperand(0);
4274
4275 if (N1.getOpcode() == ISD::ANY_EXTEND)
4276 N1 = N1.getOperand(0);
4277
4278 SDValue Mul;
4279
4280 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4281 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4282 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4283 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4284 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4285 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4286 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4287 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4288 } else {
4289 return SDValue();
4290 }
4291
4292 // We need to use sext even for MUL_U24, because MUL_U24 is used
4293 // for signed multiply of 8 and 16-bit types.
4294 return DAG.getSExtOrTrunc(Mul, DL, VT);
4295}
4296
4297SDValue
4298AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4299 DAGCombinerInfo &DCI) const {
4300 if (N->getValueType(0) != MVT::i32)
4301 return SDValue();
4302
4303 SelectionDAG &DAG = DCI.DAG;
4304 SDLoc DL(N);
4305
4306 SDValue N0 = N->getOperand(0);
4307 SDValue N1 = N->getOperand(1);
4308
4309 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4310 // in the source into any_extends if the result of the mul is truncated. Since
4311 // we can assume the high bits are whatever we want, use the underlying value
4312 // to keep the unknown high bits from interfering.
4313 if (N0.getOpcode() == ISD::ANY_EXTEND)
4314 N0 = N0.getOperand(0);
4315 if (N1.getOpcode() == ISD::ANY_EXTEND)
4316 N1 = N1.getOperand(0);
4317
4318 // Try to use two fast 24-bit multiplies (one for each half of the result)
4319 // instead of one slow extending multiply.
4320 unsigned LoOpcode, HiOpcode;
4321 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4322 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4323 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4324 LoOpcode = AMDGPUISD::MUL_U24;
4325 HiOpcode = AMDGPUISD::MULHI_U24;
4326 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4327 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4328 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4329 LoOpcode = AMDGPUISD::MUL_I24;
4330 HiOpcode = AMDGPUISD::MULHI_I24;
4331 } else {
4332 return SDValue();
4333 }
4334
4335 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4336 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4337 DCI.CombineTo(N, Lo, Hi);
4338 return SDValue(N, 0);
4339}
4340
4341SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4342 DAGCombinerInfo &DCI) const {
4343 EVT VT = N->getValueType(0);
4344
4345 if (!Subtarget->hasMulI24() || VT.isVector())
4346 return SDValue();
4347
4348 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4349 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4350 // unnecessarily). isDivergent() is used as an approximation of whether the
4351 // value is in an SGPR.
4352 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4353 // valu op anyway)
4354 if (Subtarget->hasSMulHi() && !N->isDivergent())
4355 return SDValue();
4356
4357 SelectionDAG &DAG = DCI.DAG;
4358 SDLoc DL(N);
4359
4360 SDValue N0 = N->getOperand(0);
4361 SDValue N1 = N->getOperand(1);
4362
4363 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4364 return SDValue();
4365
4366 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4367 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4368
4369 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4370 DCI.AddToWorklist(Mulhi.getNode());
4371 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4372}
4373
4374SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4375 DAGCombinerInfo &DCI) const {
4376 EVT VT = N->getValueType(0);
4377
4378 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4379 return SDValue();
4380
4381 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4382 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4383 // unnecessarily). isDivergent() is used as an approximation of whether the
4384 // value is in an SGPR.
4385 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4386 // valu op anyway)
4387 if (Subtarget->hasSMulHi() && !N->isDivergent())
4388 return SDValue();
4389
4390 SelectionDAG &DAG = DCI.DAG;
4391 SDLoc DL(N);
4392
4393 SDValue N0 = N->getOperand(0);
4394 SDValue N1 = N->getOperand(1);
4395
4396 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4397 return SDValue();
4398
4399 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4400 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4401
4402 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4403 DCI.AddToWorklist(Mulhi.getNode());
4404 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4405}
4406
4407SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4408 SDValue Op,
4409 const SDLoc &DL,
4410 unsigned Opc) const {
4411 EVT VT = Op.getValueType();
4412 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4413 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4414 LegalVT != MVT::i16))
4415 return SDValue();
4416
4417 if (VT != MVT::i32)
4418 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4419
4420 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4421 if (VT != MVT::i32)
4422 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4423
4424 return FFBX;
4425}
4426
4427// The native instructions return -1 on 0 input. Optimize out a select that
4428// produces -1 on 0.
4429//
4430// TODO: If zero is not undef, we could also do this if the output is compared
4431// against the bitwidth.
4432//
4433// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4434SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4435 SDValue LHS, SDValue RHS,
4436 DAGCombinerInfo &DCI) const {
4437 if (!isNullConstant(Cond.getOperand(1)))
4438 return SDValue();
4439
4440 SelectionDAG &DAG = DCI.DAG;
4441 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4442 SDValue CmpLHS = Cond.getOperand(0);
4443
4444 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4445 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4446 if (CCOpcode == ISD::SETEQ &&
4447 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4448 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4449 unsigned Opc =
4450 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4451 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4452 }
4453
4454 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4455 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4456 if (CCOpcode == ISD::SETNE &&
4457 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4458 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4459 unsigned Opc =
4460 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4461
4462 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4463 }
4464
4465 return SDValue();
4466}
4467
4468static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4469 unsigned Op,
4470 const SDLoc &SL,
4471 SDValue Cond,
4472 SDValue N1,
4473 SDValue N2) {
4474 SelectionDAG &DAG = DCI.DAG;
4475 EVT VT = N1.getValueType();
4476
4477 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4478 N1.getOperand(0), N2.getOperand(0));
4479 DCI.AddToWorklist(NewSelect.getNode());
4480 return DAG.getNode(Op, SL, VT, NewSelect);
4481}
4482
4483// Pull a free FP operation out of a select so it may fold into uses.
4484//
4485// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4486// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4487//
4488// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4489// select c, (fabs x), +k -> fabs (select c, x, k)
4490SDValue
4491AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4492 SDValue N) const {
4493 SelectionDAG &DAG = DCI.DAG;
4494 SDValue Cond = N.getOperand(0);
4495 SDValue LHS = N.getOperand(1);
4496 SDValue RHS = N.getOperand(2);
4497
4498 EVT VT = N.getValueType();
4499 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4500 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4501 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4502 return SDValue();
4503
4504 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4505 SDLoc(N), Cond, LHS, RHS);
4506 }
4507
4508 bool Inv = false;
4509 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4510 std::swap(LHS, RHS);
4511 Inv = true;
4512 }
4513
4514 // TODO: Support vector constants.
4515 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4516 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4517 !selectSupportsSourceMods(N.getNode())) {
4518 SDLoc SL(N);
4519 // If one side is an fneg/fabs and the other is a constant, we can push the
4520 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4521 SDValue NewLHS = LHS.getOperand(0);
4522 SDValue NewRHS = RHS;
4523
4524 // Careful: if the neg can be folded up, don't try to pull it back down.
4525 bool ShouldFoldNeg = true;
4526
4527 if (NewLHS.hasOneUse()) {
4528 unsigned Opc = NewLHS.getOpcode();
4529 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4530 ShouldFoldNeg = false;
4531 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4532 ShouldFoldNeg = false;
4533 }
4534
4535 if (ShouldFoldNeg) {
4536 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4537 return SDValue();
4538
4539 // We're going to be forced to use a source modifier anyway, there's no
4540 // point to pulling the negate out unless we can get a size reduction by
4541 // negating the constant.
4542 //
4543 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4544 // about cheaper constants.
4545 if (NewLHS.getOpcode() == ISD::FABS &&
4546 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4547 return SDValue();
4548
4550 return SDValue();
4551
4552 if (LHS.getOpcode() == ISD::FNEG)
4553 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4554
4555 if (Inv)
4556 std::swap(NewLHS, NewRHS);
4557
4558 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4559 Cond, NewLHS, NewRHS);
4560 DCI.AddToWorklist(NewSelect.getNode());
4561 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4562 }
4563 }
4564
4565 return SDValue();
4566}
4567
4568SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4569 DAGCombinerInfo &DCI) const {
4570 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4571 return Folded;
4572
4573 SDValue Cond = N->getOperand(0);
4574 if (Cond.getOpcode() != ISD::SETCC)
4575 return SDValue();
4576
4577 EVT VT = N->getValueType(0);
4578 SDValue LHS = Cond.getOperand(0);
4579 SDValue RHS = Cond.getOperand(1);
4580 SDValue CC = Cond.getOperand(2);
4581
4582 SDValue True = N->getOperand(1);
4583 SDValue False = N->getOperand(2);
4584
4585 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4586 SelectionDAG &DAG = DCI.DAG;
4587 if (DAG.isConstantValueOfAnyType(True) &&
4588 !DAG.isConstantValueOfAnyType(False)) {
4589 // Swap cmp + select pair to move constant to false input.
4590 // This will allow using VOPC cndmasks more often.
4591 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4592
4593 SDLoc SL(N);
4594 ISD::CondCode NewCC =
4595 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4596
4597 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4598 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4599 }
4600
4601 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4602 SDValue MinMax
4603 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4604 // Revisit this node so we can catch min3/max3/med3 patterns.
4605 //DCI.AddToWorklist(MinMax.getNode());
4606 return MinMax;
4607 }
4608 }
4609
4610 // There's no reason to not do this if the condition has other uses.
4611 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4612}
4613
4614static bool isInv2Pi(const APFloat &APF) {
4615 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4616 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4617 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4618
4619 return APF.bitwiseIsEqual(KF16) ||
4620 APF.bitwiseIsEqual(KF32) ||
4621 APF.bitwiseIsEqual(KF64);
4622}
4623
4624 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4625 // additional cost to negate them.
4626TargetLowering::NegatibleCost
4627AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4628 if (C->isZero())
4629 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4630
4631 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4632 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4633
4634 return NegatibleCost::Neutral;
4635}
4636
4637bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4638 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4639 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4640 return false;
4641}
4642
4643bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4644 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4645 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4646 return false;
4647}
4648
4649static unsigned inverseMinMax(unsigned Opc) {
4650 switch (Opc) {
4651 case ISD::FMAXNUM:
4652 return ISD::FMINNUM;
4653 case ISD::FMINNUM:
4654 return ISD::FMAXNUM;
4655 case ISD::FMAXNUM_IEEE:
4656 return ISD::FMINNUM_IEEE;
4657 case ISD::FMINNUM_IEEE:
4658 return ISD::FMAXNUM_IEEE;
4659 case ISD::FMAXIMUM:
4660 return ISD::FMINIMUM;
4661 case ISD::FMINIMUM:
4662 return ISD::FMAXIMUM;
4663 case AMDGPUISD::FMAX_LEGACY:
4664 return AMDGPUISD::FMIN_LEGACY;
4665 case AMDGPUISD::FMIN_LEGACY:
4666 return AMDGPUISD::FMAX_LEGACY;
4667 default:
4668 llvm_unreachable("invalid min/max opcode");
4669 }
4670}
4671
4672/// \return true if it's profitable to try to push an fneg into its source
4673/// instruction.
4674static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4675 // If the input has multiple uses and we can either fold the negate down, or
4676 // the other uses cannot, give up. This both prevents unprofitable
4677 // transformations and infinite loops: we won't repeatedly try to fold around
4678 // a negate that has no 'good' form.
4679 if (N0.hasOneUse()) {
4680 // This may be able to fold into the source, but at a code size cost. Don't
4681 // fold if the fold into the user is free.
4682 if (allUsesHaveSourceMods(N, 0))
4683 return false;
4684 } else {
4685 if (fnegFoldsIntoOp(N0.getNode()) &&
4686 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4687 return false;
4688 }
4689
4690 return true;
4691}
4692
4693SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4694 DAGCombinerInfo &DCI) const {
4695 SelectionDAG &DAG = DCI.DAG;
4696 SDValue N0 = N->getOperand(0);
4697 EVT VT = N->getValueType(0);
4698
4699 unsigned Opc = N0.getOpcode();
4700
4701 if (!shouldFoldFNegIntoSrc(N, N0))
4702 return SDValue();
4703
4704 SDLoc SL(N);
4705 switch (Opc) {
4706 case ISD::FADD: {
4707 if (!mayIgnoreSignedZero(N0))
4708 return SDValue();
4709
4710 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4711 SDValue LHS = N0.getOperand(0);
4712 SDValue RHS = N0.getOperand(1);
4713
4714 if (LHS.getOpcode() != ISD::FNEG)
4715 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4716 else
4717 LHS = LHS.getOperand(0);
4718
4719 if (RHS.getOpcode() != ISD::FNEG)
4720 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4721 else
4722 RHS = RHS.getOperand(0);
4723
4724 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4725 if (Res.getOpcode() != ISD::FADD)
4726 return SDValue(); // Op got folded away.
4727 if (!N0.hasOneUse())
4728 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4729 return Res;
4730 }
4731 case ISD::FMUL:
4732 case AMDGPUISD::FMUL_LEGACY: {
4733 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4734 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4735 SDValue LHS = N0.getOperand(0);
4736 SDValue RHS = N0.getOperand(1);
4737
4738 if (LHS.getOpcode() == ISD::FNEG)
4739 LHS = LHS.getOperand(0);
4740 else if (RHS.getOpcode() == ISD::FNEG)
4741 RHS = RHS.getOperand(0);
4742 else
4743 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4744
4745 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4746 if (Res.getOpcode() != Opc)
4747 return SDValue(); // Op got folded away.
4748 if (!N0.hasOneUse())
4749 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4750 return Res;
4751 }
4752 case ISD::FMA:
4753 case ISD::FMAD: {
4754 // TODO: handle llvm.amdgcn.fma.legacy
4755 if (!mayIgnoreSignedZero(N0))
4756 return SDValue();
4757
4758 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4759 SDValue LHS = N0.getOperand(0);
4760 SDValue MHS = N0.getOperand(1);
4761 SDValue RHS = N0.getOperand(2);
4762
4763 if (LHS.getOpcode() == ISD::FNEG)
4764 LHS = LHS.getOperand(0);
4765 else if (MHS.getOpcode() == ISD::FNEG)
4766 MHS = MHS.getOperand(0);
4767 else
4768 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4769
4770 if (RHS.getOpcode() != ISD::FNEG)
4771 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4772 else
4773 RHS = RHS.getOperand(0);
4774
4775 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4776 if (Res.getOpcode() != Opc)
4777 return SDValue(); // Op got folded away.
4778 if (!N0.hasOneUse())
4779 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4780 return Res;
4781 }
4782 case ISD::FMAXNUM:
4783 case ISD::FMINNUM:
4784 case ISD::FMAXNUM_IEEE:
4785 case ISD::FMINNUM_IEEE:
4786 case ISD::FMINIMUM:
4787 case ISD::FMAXIMUM:
4788 case AMDGPUISD::FMAX_LEGACY:
4789 case AMDGPUISD::FMIN_LEGACY: {
4790 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4791 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4792 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4793 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4794
4795 SDValue LHS = N0.getOperand(0);
4796 SDValue RHS = N0.getOperand(1);
4797
4798 // 0 doesn't have a negated inline immediate.
4799 // TODO: This constant check should be generalized to other operations.
4800 if (isConstantCostlierToNegate(RHS))
4801 return SDValue();
4802
4803 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4804 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4805 unsigned Opposite = inverseMinMax(Opc);
4806
4807 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4808 if (Res.getOpcode() != Opposite)
4809 return SDValue(); // Op got folded away.
4810 if (!N0.hasOneUse())
4811 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4812 return Res;
4813 }
4814 case AMDGPUISD::FMED3: {
4815 SDValue Ops[3];
4816 for (unsigned I = 0; I < 3; ++I)
4817 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4818
4819 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4820 if (Res.getOpcode() != AMDGPUISD::FMED3)
4821 return SDValue(); // Op got folded away.
4822
4823 if (!N0.hasOneUse()) {
4824 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4825 DAG.ReplaceAllUsesWith(N0, Neg);
4826
4827 for (SDNode *U : Neg->uses())
4828 DCI.AddToWorklist(U);
4829 }
4830
4831 return Res;
4832 }
4833 case ISD::FP_EXTEND:
4834 case ISD::FTRUNC:
4835 case ISD::FRINT:
4836 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4837 case ISD::FROUNDEVEN:
4838 case ISD::FSIN:
4839 case ISD::FCANONICALIZE:
4840 case AMDGPUISD::RCP:
4841 case AMDGPUISD::RCP_LEGACY:
4842 case AMDGPUISD::RCP_IFLAG:
4843 case AMDGPUISD::SIN_HW: {
4844 SDValue CvtSrc = N0.getOperand(0);
4845 if (CvtSrc.getOpcode() == ISD::FNEG) {
4846 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4847 // (fneg (rcp (fneg x))) -> (rcp x)
4848 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4849 }
4850
4851 if (!N0.hasOneUse())
4852 return SDValue();
4853
4854 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4855 // (fneg (rcp x)) -> (rcp (fneg x))
4856 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4857 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4858 }
4859 case ISD::FP_ROUND: {
4860 SDValue CvtSrc = N0.getOperand(0);
4861
4862 if (CvtSrc.getOpcode() == ISD::FNEG) {
4863 // (fneg (fp_round (fneg x))) -> (fp_round x)
4864 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4865 CvtSrc.getOperand(0), N0.getOperand(1));
4866 }
4867
4868 if (!N0.hasOneUse())
4869 return SDValue();
4870
4871 // (fneg (fp_round x)) -> (fp_round (fneg x))
4872 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4873 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4874 }
4875 case ISD::FP16_TO_FP: {
4876 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4877 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4878 // Put the fneg back as a legal source operation that can be matched later.
4879 SDLoc SL(N);
4880
4881 SDValue Src = N0.getOperand(0);
4882 EVT SrcVT = Src.getValueType();
4883
4884 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4885 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4886 DAG.getConstant(0x8000, SL, SrcVT));
4887 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4888 }
4889 case ISD::SELECT: {
4890 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4891 // TODO: Invert conditions of foldFreeOpFromSelect
4892 return SDValue();
4893 }
4894 case ISD::BITCAST: {
4895 SDLoc SL(N);
4896 SDValue BCSrc = N0.getOperand(0);
4897 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4898 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4899 if (HighBits.getValueType().getSizeInBits() != 32 ||
4900 !fnegFoldsIntoOp(HighBits.getNode()))
4901 return SDValue();
4902
4903 // f64 fneg only really needs to operate on the high half of the
4904 // register, so try to force it to an f32 operation to help make use of
4905 // source modifiers.
4906 //
4907 //
4908 // fneg (f64 (bitcast (build_vector x, y))) ->
4909 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4910 // (fneg (bitcast i32:y to f32)))
4911
4912 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4913 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4914 SDValue CastBack =
4915 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4916
4917 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4918 Ops.back() = CastBack;
4919 DCI.AddToWorklist(NegHi.getNode());
4920 SDValue Build =
4921 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4922 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4923
4924 if (!N0.hasOneUse())
4925 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4926 return Result;
4927 }
4928
4929 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4930 BCSrc.hasOneUse()) {
4931 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4932 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4933
4934 // TODO: Cast back result for multiple uses is beneficial in some cases.
4935
4936 SDValue LHS =
4937 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4938 SDValue RHS =
4939 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4940
4941 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4942 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4943
4944 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4945 NegRHS);
4946 }
4947
4948 return SDValue();
4949 }
4950 default:
4951 return SDValue();
4952 }
4953}
4954
4955SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4956 DAGCombinerInfo &DCI) const {
4957 SelectionDAG &DAG = DCI.DAG;
4958 SDValue N0 = N->getOperand(0);
4959
4960 if (!N0.hasOneUse())
4961 return SDValue();
4962
4963 switch (N0.getOpcode()) {
4964 case ISD::FP16_TO_FP: {
4965 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4966 SDLoc SL(N);
4967 SDValue Src = N0.getOperand(0);
4968 EVT SrcVT = Src.getValueType();
4969
4970 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4971 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4972 DAG.getConstant(0x7fff, SL, SrcVT));
4973 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4974 }
4975 default:
4976 return SDValue();
4977 }
4978}
4979
4980SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4981 DAGCombinerInfo &DCI) const {
4982 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4983 if (!CFP)
4984 return SDValue();
4985
4986 // XXX - Should this flush denormals?
4987 const APFloat &Val = CFP->getValueAPF();
4988 APFloat One(Val.getSemantics(), "1.0");
4989 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4990}
4991
4992SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4993 DAGCombinerInfo &DCI) const {
4994 SelectionDAG &DAG = DCI.DAG;
4995 SDLoc DL(N);
4996
4997 switch(N->getOpcode()) {
4998 default:
4999 break;
5000 case ISD::BITCAST: {
5001 EVT DestVT = N->getValueType(0);
5002
5003 // Push casts through vector builds. This helps avoid emitting a large
5004 // number of copies when materializing floating point vector constants.
5005 //
5006 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5007 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5008 if (DestVT.isVector()) {
5009 SDValue Src = N->getOperand(0);
5010 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5011 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5012 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5013 EVT SrcVT = Src.getValueType();
5014 unsigned NElts = DestVT.getVectorNumElements();
5015
5016 if (SrcVT.getVectorNumElements() == NElts) {
5017 EVT DestEltVT = DestVT.getVectorElementType();
5018
5019 SmallVector<SDValue, 8> CastedElts;
5020 SDLoc SL(N);
5021 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5022 SDValue Elt = Src.getOperand(I);
5023 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5024 }
5025
5026 return DAG.getBuildVector(DestVT, SL, CastedElts);
5027 }
5028 }
5029 }
5030
5031 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5032 break;
5033
5034 // Fold bitcasts of constants.
5035 //
5036 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5037 // TODO: Generalize and move to DAGCombiner
5038 SDValue Src = N->getOperand(0);
5039 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5040 SDLoc SL(N);
5041 uint64_t CVal = C->getZExtValue();
5042 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5043 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5044 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5045 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5046 }
5047
5048 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5049 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5050 SDLoc SL(N);
5051 uint64_t CVal = Val.getZExtValue();
5052 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5053 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5054 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5055
5056 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5057 }
5058
5059 break;
5060 }
5061 case ISD::SHL: {
5062 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5063 break;
5064
5065 return performShlCombine(N, DCI);
5066 }
5067 case ISD::SRL: {
5068 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5069 break;
5070
5071 return performSrlCombine(N, DCI);
5072 }
5073 case ISD::SRA: {
5074 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5075 break;
5076
5077 return performSraCombine(N, DCI);
5078 }
5079 case ISD::TRUNCATE:
5080 return performTruncateCombine(N, DCI);
5081 case ISD::MUL:
5082 return performMulCombine(N, DCI);
5083 case AMDGPUISD::MUL_U24:
5084 case AMDGPUISD::MUL_I24: {
5085 if (SDValue Simplified = simplifyMul24(N, DCI))
5086 return Simplified;
5087 break;
5088 }
5089 case AMDGPUISD::MULHI_I24:
5090 case AMDGPUISD::MULHI_U24:
5091 return simplifyMul24(N, DCI);
5092 case ISD::SMUL_LOHI:
5093 case ISD::UMUL_LOHI:
5094 return performMulLoHiCombine(N, DCI);
5095 case ISD::MULHS:
5096 return performMulhsCombine(N, DCI);
5097 case ISD::MULHU:
5098 return performMulhuCombine(N, DCI);
5099 case ISD::SELECT:
5100 return performSelectCombine(N, DCI);
5101 case ISD::FNEG:
5102 return performFNegCombine(N, DCI);
5103 case ISD::FABS:
5104 return performFAbsCombine(N, DCI);
5105 case AMDGPUISD::BFE_I32:
5106 case AMDGPUISD::BFE_U32: {
5107 assert(!N->getValueType(0).isVector() &&
5108 "Vector handling of BFE not implemented");
5109 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5110 if (!Width)
5111 break;
5112
5113 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5114 if (WidthVal == 0)
5115 return DAG.getConstant(0, DL, MVT::i32);
5116
5117 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5118 if (!Offset)
5119 break;
5120
5121 SDValue BitsFrom = N->getOperand(0);
5122 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5123
5124 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5125
5126 if (OffsetVal == 0) {
5127 // This is already sign / zero extended, so try to fold away extra BFEs.
5128 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5129
5130 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5131 if (OpSignBits >= SignBits)
5132 return BitsFrom;
5133
5134 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5135 if (Signed) {
5136 // This is a sign_extend_inreg. Replace it to take advantage of existing
5137 // DAG Combines. If not eliminated, we will match back to BFE during
5138 // selection.
5139
5140 // TODO: The sext_inreg of extended types ends up here, although we could
5141 // handle them in a single BFE.
5142 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5143 DAG.getValueType(SmallVT));
5144 }
5145
5146 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5147 }
5148
5149 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5150 if (Signed) {
5151 return constantFoldBFE<int32_t>(DAG,
5152 CVal->getSExtValue(),
5153 OffsetVal,
5154 WidthVal,
5155 DL);
5156 }
5157
5158 return constantFoldBFE<uint32_t>(DAG,
5159 CVal->getZExtValue(),
5160 OffsetVal,
5161 WidthVal,
5162 DL);
5163 }
5164
5165 if ((OffsetVal + WidthVal) >= 32 &&
5166 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5167 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5168 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5169 BitsFrom, ShiftVal);
5170 }
5171
5172 if (BitsFrom.hasOneUse()) {
5173 APInt Demanded = APInt::getBitsSet(32,
5174 OffsetVal,
5175 OffsetVal + WidthVal);
5176
5177 KnownBits Known;
5178 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5179 !DCI.isBeforeLegalizeOps());
5180 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5181 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5182 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5183 DCI.CommitTargetLoweringOpt(TLO);
5184 }
5185 }
5186
5187 break;
5188 }
5189 case ISD::LOAD:
5190 return performLoadCombine(N, DCI);
5191 case ISD::STORE:
5192 return performStoreCombine(N, DCI);
5193 case AMDGPUISD::RCP:
5194 case AMDGPUISD::RCP_IFLAG:
5195 return performRcpCombine(N, DCI);
5196 case ISD::AssertZext:
5197 case ISD::AssertSext:
5198 return performAssertSZExtCombine(N, DCI);
5199 case ISD::INTRINSIC_WO_CHAIN:
5200 return performIntrinsicWOChainCombine(N, DCI);
5201 case AMDGPUISD::FMAD_FTZ: {
5202 SDValue N0 = N->getOperand(0);
5203 SDValue N1 = N->getOperand(1);
5204 SDValue N2 = N->getOperand(2);
5205 EVT VT = N->getValueType(0);
5206
5207 // FMAD_FTZ is a FMAD + flush denormals to zero.
5208 // We flush the inputs, the intermediate step, and the output.
5209 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5210 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5211 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5212 if (N0CFP && N1CFP && N2CFP) {
5213 const auto FTZ = [](const APFloat &V) {
5214 if (V.isDenormal()) {
5215 APFloat Zero(V.getSemantics(), 0);
5216 return V.isNegative() ? -Zero : Zero;
5217 }
5218 return V;
5219 };
5220
5221 APFloat V0 = FTZ(N0CFP->getValueAPF());
5222 APFloat V1 = FTZ(N1CFP->getValueAPF());
5223 APFloat V2 = FTZ(N2CFP->getValueAPF());
5224 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5225 V0 = FTZ(V0);
5226 V0.add(V2, APFloat::rmNearestTiesToEven);
5227 return DAG.getConstantFP(FTZ(V0), DL, VT);
5228 }
5229 break;
5230 }
5231 }
5232 return SDValue();
5233}
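// A minimal reference sketch of the scalar bfe_u32 semantics that the BFE
// constant folds and shift rewrites above assume: offset and width are taken
// modulo 32, and a zero width yields zero. This helper is illustrative only
// and is not used by the lowering.
[[maybe_unused]] static uint32_t bfeU32Reference(uint32_t Src, uint32_t Offset,
                                                 uint32_t Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;
  // Build the mask in 64 bits so the computation stays well defined for any
  // masked Width value.
  uint64_t Mask = (uint64_t(1) << Width) - 1;
  return uint32_t((Src >> Offset) & Mask);
}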
5234
5235//===----------------------------------------------------------------------===//
5236// Helper functions
5237//===----------------------------------------------------------------------===//
5238
5239 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5240 const TargetRegisterClass *RC,
5241 Register Reg, EVT VT,
5242 const SDLoc &SL,
5243 bool RawReg) const {
5244 MachineFunction &MF = DAG.getMachineFunction();
5245 MachineRegisterInfo &MRI = MF.getRegInfo();
5246 Register VReg;
5247
5248 if (!MRI.isLiveIn(Reg)) {
5249 VReg = MRI.createVirtualRegister(RC);
5250 MRI.addLiveIn(Reg, VReg);
5251 } else {
5252 VReg = MRI.getLiveInVirtReg(Reg);
5253 }
5254
5255 if (RawReg)
5256 return DAG.getRegister(VReg, VT);
5257
5258 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5259}
5260
5261// This may be called multiple times, and nothing prevents creating multiple
5262// objects at the same offset. See if we already defined this object.
5263 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5264 int64_t Offset) {
5265 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5266 if (MFI.getObjectOffset(I) == Offset) {
5267 assert(MFI.getObjectSize(I) == Size);
5268 return I;
5269 }
5270 }
5271
5272 return MFI.CreateFixedObject(Size, Offset, true);
5273}
5274
5275 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5276 EVT VT,
5277 const SDLoc &SL,
5278 int64_t Offset) const {
5279 MachineFunction &MF = DAG.getMachineFunction();
5280 MachineFrameInfo &MFI = MF.getFrameInfo();
5281 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5282
5283 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5284 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5285
5286 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5287 MachineMemOperand::MODereferenceable |
5288 MachineMemOperand::MOInvariant);
5289 }
5290
5291 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5292 const SDLoc &SL,
5293 SDValue Chain,
5294 SDValue ArgVal,
5295 int64_t Offset) const {
5296 MachineFunction &MF = DAG.getMachineFunction();
5297 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5298 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5299 
5300 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5301 // Stores to the argument stack area are relative to the stack pointer.
5302 SDValue SP =
5303 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5304 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5305 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5306 MachineMemOperand::MODereferenceable);
5307 return Store;
5308}
5309
5310 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5311 const TargetRegisterClass *RC,
5312 EVT VT, const SDLoc &SL,
5313 const ArgDescriptor &Arg) const {
5314 assert(Arg && "Attempting to load missing argument");
5315
5316 SDValue V = Arg.isRegister() ?
5317 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5318 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5319
5320 if (!Arg.isMasked())
5321 return V;
5322
5323 unsigned Mask = Arg.getMask();
5324 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5325 V = DAG.getNode(ISD::SRL, SL, VT, V,
5326 DAG.getShiftAmountConstant(Shift, VT, SL));
5327 return DAG.getNode(ISD::AND, SL, VT, V,
5328 DAG.getConstant(Mask >> Shift, SL, VT));
5329}
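// As an illustration only: with a hypothetical Arg mask of 0x3ff0, Shift is 4,
// so the loaded value is decoded as (V >> 4) & 0x3ff, i.e. bits [13:4] of the
// packed input register or stack slot.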
5330
5331 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5332 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5333 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5334 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5335 uint64_t ArgOffset =
5336 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5337 switch (Param) {
5338 case FIRST_IMPLICIT:
5339 return ArgOffset;
5340 case PRIVATE_BASE:
5341 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5342 case SHARED_BASE:
5343 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5344 case QUEUE_PTR:
5345 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5346 }
5347 llvm_unreachable("unexpected implicit parameter type");
5348}
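// Illustrative example (numbers are hypothetical): with an ExplicitKernArgSize
// of 36 bytes, an implicit-arg alignment of 8, and an explicit kernarg offset
// of 0, the implicit parameters start at alignTo(36, 8) + 0 = 40 bytes, and
// each ImplicitParameter value above adds its fixed offset from there.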
5349
5350 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5351 const MachineFunction &MF, const ImplicitParameter Param) const {
5352 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5353 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5354 }
5355
5356#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5357
5358const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5359 switch ((AMDGPUISD::NodeType)Opcode) {
5360 case AMDGPUISD::FIRST_NUMBER: break;
5361 // AMDIL DAG nodes
5362 NODE_NAME_CASE(UMUL);
5363 NODE_NAME_CASE(BRANCH_COND);
5364
5365 // AMDGPU DAG nodes
5366 NODE_NAME_CASE(IF)
5367 NODE_NAME_CASE(ELSE)
5368 NODE_NAME_CASE(LOOP)
5369 NODE_NAME_CASE(CALL)
5370 NODE_NAME_CASE(TC_RETURN)
5371 NODE_NAME_CASE(TC_RETURN_GFX)
5372 NODE_NAME_CASE(TC_RETURN_CHAIN)
5373 NODE_NAME_CASE(TRAP)
5374 NODE_NAME_CASE(RET_GLUE)
5375 NODE_NAME_CASE(WAVE_ADDRESS)
5376 NODE_NAME_CASE(RETURN_TO_EPILOG)
5377 NODE_NAME_CASE(ENDPGM)
5378 NODE_NAME_CASE(ENDPGM_TRAP)
5379 NODE_NAME_CASE(DWORDADDR)
5380 NODE_NAME_CASE(FRACT)
5381 NODE_NAME_CASE(SETCC)
5382 NODE_NAME_CASE(SETREG)
5383 NODE_NAME_CASE(DENORM_MODE)
5384 NODE_NAME_CASE(FMA_W_CHAIN)
5385 NODE_NAME_CASE(FMUL_W_CHAIN)
5386 NODE_NAME_CASE(CLAMP)
5387 NODE_NAME_CASE(COS_HW)
5388 NODE_NAME_CASE(SIN_HW)
5389 NODE_NAME_CASE(FMAX_LEGACY)
5390 NODE_NAME_CASE(FMIN_LEGACY)
5391 NODE_NAME_CASE(FMAX3)
5392 NODE_NAME_CASE(SMAX3)
5393 NODE_NAME_CASE(UMAX3)
5394 NODE_NAME_CASE(FMIN3)
5395 NODE_NAME_CASE(SMIN3)
5396 NODE_NAME_CASE(UMIN3)
5397 NODE_NAME_CASE(FMED3)
5398 NODE_NAME_CASE(SMED3)
5399 NODE_NAME_CASE(UMED3)
5400 NODE_NAME_CASE(FMAXIMUM3)
5401 NODE_NAME_CASE(FMINIMUM3)
5402 NODE_NAME_CASE(FDOT2)
5403 NODE_NAME_CASE(URECIP)
5404 NODE_NAME_CASE(DIV_SCALE)
5405 NODE_NAME_CASE(DIV_FMAS)
5406 NODE_NAME_CASE(DIV_FIXUP)
5407 NODE_NAME_CASE(FMAD_FTZ)
5408 NODE_NAME_CASE(RCP)
5409 NODE_NAME_CASE(RSQ)
5410 NODE_NAME_CASE(RCP_LEGACY)
5411 NODE_NAME_CASE(RCP_IFLAG)
5412 NODE_NAME_CASE(LOG)
5413 NODE_NAME_CASE(EXP)
5414 NODE_NAME_CASE(FMUL_LEGACY)
5415 NODE_NAME_CASE(RSQ_CLAMP)
5416 NODE_NAME_CASE(FP_CLASS)
5417 NODE_NAME_CASE(DOT4)
5418 NODE_NAME_CASE(CARRY)
5419 NODE_NAME_CASE(BORROW)
5420 NODE_NAME_CASE(BFE_U32)
5421 NODE_NAME_CASE(BFE_I32)
5422 NODE_NAME_CASE(BFI)
5423 NODE_NAME_CASE(BFM)
5424 NODE_NAME_CASE(FFBH_U32)
5425 NODE_NAME_CASE(FFBH_I32)
5426 NODE_NAME_CASE(FFBL_B32)
5427 NODE_NAME_CASE(MUL_U24)
5428 NODE_NAME_CASE(MUL_I24)
5429 NODE_NAME_CASE(MULHI_U24)
5430 NODE_NAME_CASE(MULHI_I24)
5431 NODE_NAME_CASE(MAD_U24)
5432 NODE_NAME_CASE(MAD_I24)
5433 NODE_NAME_CASE(MAD_I64_I32)
5434 NODE_NAME_CASE(MAD_U64_U32)
5435 NODE_NAME_CASE(PERM)
5436 NODE_NAME_CASE(TEXTURE_FETCH)
5437 NODE_NAME_CASE(R600_EXPORT)
5438 NODE_NAME_CASE(CONST_ADDRESS)
5439 NODE_NAME_CASE(REGISTER_LOAD)
5440 NODE_NAME_CASE(REGISTER_STORE)
5441 NODE_NAME_CASE(SAMPLE)
5442 NODE_NAME_CASE(SAMPLEB)
5443 NODE_NAME_CASE(SAMPLED)
5444 NODE_NAME_CASE(SAMPLEL)
5445 NODE_NAME_CASE(CVT_F32_UBYTE0)
5446 NODE_NAME_CASE(CVT_F32_UBYTE1)
5447 NODE_NAME_CASE(CVT_F32_UBYTE2)
5448 NODE_NAME_CASE(CVT_F32_UBYTE3)
5449 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5450 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5451 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5452 NODE_NAME_CASE(CVT_PK_I16_I32)
5453 NODE_NAME_CASE(CVT_PK_U16_U32)
5454 NODE_NAME_CASE(FP_TO_FP16)
5455 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5456 NODE_NAME_CASE(CONST_DATA_PTR)
5457 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5458 NODE_NAME_CASE(LDS)
5459 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5460 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5461 NODE_NAME_CASE(DUMMY_CHAIN)
5462 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5463 NODE_NAME_CASE(LOAD_D16_HI)
5464 NODE_NAME_CASE(LOAD_D16_LO)
5465 NODE_NAME_CASE(LOAD_D16_HI_I8)
5466 NODE_NAME_CASE(LOAD_D16_HI_U8)
5467 NODE_NAME_CASE(LOAD_D16_LO_I8)
5468 NODE_NAME_CASE(LOAD_D16_LO_U8)
5469 NODE_NAME_CASE(STORE_MSKOR)
5470 NODE_NAME_CASE(LOAD_CONSTANT)
5471 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5472 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5473 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5474 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5475 NODE_NAME_CASE(DS_ORDERED_COUNT)
5476 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5477 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5478 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5479 NODE_NAME_CASE(BUFFER_LOAD)
5480 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5481 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5482 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5483 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5484 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5485 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5486 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5487 NODE_NAME_CASE(SBUFFER_LOAD)
5488 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5489 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5490 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5491 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5492 NODE_NAME_CASE(BUFFER_STORE)
5493 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5494 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5495 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5496 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5497 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5498 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5499 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5500 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5501 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5502 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5503 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5504 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5505 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5506 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5507 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5508 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5509 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5510 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5511 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5512 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5513 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5514 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5515 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5516
5517 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5518 }
5519 return nullptr;
5520}
5521
5522 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5523 SelectionDAG &DAG, int Enabled,
5524 int &RefinementSteps,
5525 bool &UseOneConstNR,
5526 bool Reciprocal) const {
5527 EVT VT = Operand.getValueType();
5528
5529 if (VT == MVT::f32) {
5530 RefinementSteps = 0;
5531 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5532 }
5533
5534 // TODO: There is also an f64 rsq instruction, but the documentation is less
5535 // clear on its precision.
5536
5537 return SDValue();
5538}
5539
5540 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5541 SelectionDAG &DAG, int Enabled,
5542 int &RefinementSteps) const {
5543 EVT VT = Operand.getValueType();
5544
5545 if (VT == MVT::f32) {
5546 // Reciprocal, < 1 ulp error.
5547 //
5548 // This reciprocal approximation converges to < 0.5 ulp error with one
5549 // Newton-Raphson step performed with two fused multiply-adds (FMAs).
5550
5551 RefinementSteps = 0;
5552 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5553 }
5554
5555 // TODO: There is also an f64 rcp instruction, but the documentation is less
5556 // clear on its precision.
5557
5558 return SDValue();
5559}
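// For reference, the single Newton-Raphson step mentioned above can be written
// with two FMAs (a sketch, assuming x0 is the initial RCP estimate of d):
//   e  = fma(-d, x0, 1.0f);  // error term: 1 - d * x0
//   x1 = fma(x0, e,  x0);    // refined estimate: x0 + x0 * (1 - d * x0)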
5560
5561static unsigned workitemIntrinsicDim(unsigned ID) {
5562 switch (ID) {
5563 case Intrinsic::amdgcn_workitem_id_x:
5564 return 0;
5565 case Intrinsic::amdgcn_workitem_id_y:
5566 return 1;
5567 case Intrinsic::amdgcn_workitem_id_z:
5568 return 2;
5569 default:
5570 llvm_unreachable("not a workitem intrinsic");
5571 }
5572}
5573
5574 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5575 const SDValue Op, KnownBits &Known,
5576 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5577
5578 Known.resetAll(); // Don't know anything.
5579
5580 unsigned Opc = Op.getOpcode();
5581
5582 switch (Opc) {
5583 default:
5584 break;
5585 case AMDGPUISD::CARRY:
5586 case AMDGPUISD::BORROW: {
5587 Known.Zero = APInt::getHighBitsSet(32, 31);
5588 break;
5589 }
5590
5591 case AMDGPUISD::BFE_I32:
5592 case AMDGPUISD::BFE_U32: {
5593 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5594 if (!CWidth)
5595 return;
5596
5597 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5598
5599 if (Opc == AMDGPUISD::BFE_U32)
5600 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5601
5602 break;
5603 }
5604 case AMDGPUISD::FP_TO_FP16: {
5605 unsigned BitWidth = Known.getBitWidth();
5606
5607 // High bits are zero.
5608 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5609 break;
5610 }
5611 case AMDGPUISD::MUL_U24:
5612 case AMDGPUISD::MUL_I24: {
5613 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5614 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5615 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5616 RHSKnown.countMinTrailingZeros();
5617 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5618 // Skip extra check if all bits are known zeros.
5619 if (TrailZ >= 32)
5620 break;
5621
5622 // Truncate to 24 bits.
5623 LHSKnown = LHSKnown.trunc(24);
5624 RHSKnown = RHSKnown.trunc(24);
5625
5626 if (Opc == AMDGPUISD::MUL_I24) {
5627 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5628 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5629 unsigned MaxValBits = LHSValBits + RHSValBits;
5630 if (MaxValBits > 32)
5631 break;
5632 unsigned SignBits = 32 - MaxValBits + 1;
5633 bool LHSNegative = LHSKnown.isNegative();
5634 bool LHSNonNegative = LHSKnown.isNonNegative();
5635 bool LHSPositive = LHSKnown.isStrictlyPositive();
5636 bool RHSNegative = RHSKnown.isNegative();
5637 bool RHSNonNegative = RHSKnown.isNonNegative();
5638 bool RHSPositive = RHSKnown.isStrictlyPositive();
5639
5640 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5641 Known.Zero.setHighBits(SignBits);
5642 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5643 Known.One.setHighBits(SignBits);
5644 } else {
5645 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5646 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5647 unsigned MaxValBits = LHSValBits + RHSValBits;
5648 if (MaxValBits >= 32)
5649 break;
5650 Known.Zero.setBitsFrom(MaxValBits);
5651 }
5652 break;
5653 }
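// For example, if the truncated operands are known to use at most 10 and 12
// value bits respectively, the unsigned product needs at most 22 bits, so the
// top 10 bits of the 32-bit result are known zero (the setBitsFrom above).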
5654 case AMDGPUISD::PERM: {
5655 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5656 if (!CMask)
5657 return;
5658
5659 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5660 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5661 unsigned Sel = CMask->getZExtValue();
5662
5663 for (unsigned I = 0; I < 32; I += 8) {
5664 unsigned SelBits = Sel & 0xff;
5665 if (SelBits < 4) {
5666 SelBits *= 8;
5667 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5668 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5669 } else if (SelBits < 7) {
5670 SelBits = (SelBits & 3) * 8;
5671 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5672 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5673 } else if (SelBits == 0x0c) {
5674 Known.Zero |= 0xFFull << I;
5675 } else if (SelBits > 0x0c) {
5676 Known.One |= 0xFFull << I;
5677 }
5678 Sel >>= 8;
5679 }
5680 break;
5681 }
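// For example, a selector byte of 0x05 copies byte 1 of operand 0 into the
// corresponding result byte, a selector of 0x00..0x03 copies the matching byte
// of operand 1, 0x0c forces the byte to 0x00, and anything above 0x0c forces
// it to 0xff, mirroring the branches above.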
5682 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5683 Known.Zero.setHighBits(24);
5684 break;
5685 }
5686 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5687 Known.Zero.setHighBits(16);
5688 break;
5689 }
5690 case AMDGPUISD::LDS: {
5691 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5692 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5693
5694 Known.Zero.setHighBits(16);
5695 Known.Zero.setLowBits(Log2(Alignment));
5696 break;
5697 }
5698 case AMDGPUISD::SMIN3:
5699 case AMDGPUISD::SMAX3:
5700 case AMDGPUISD::SMED3:
5701 case AMDGPUISD::UMIN3:
5702 case AMDGPUISD::UMAX3:
5703 case AMDGPUISD::UMED3: {
5704 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5705 if (Known2.isUnknown())
5706 break;
5707
5708 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5709 if (Known1.isUnknown())
5710 break;
5711
5712 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5713 if (Known0.isUnknown())
5714 break;
5715
5716 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5717 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5718 Known.One = Known0.One & Known1.One & Known2.One;
5719 break;
5720 }
5721 case ISD::INTRINSIC_WO_CHAIN: {
5722 unsigned IID = Op.getConstantOperandVal(0);
5723 switch (IID) {
5724 case Intrinsic::amdgcn_workitem_id_x:
5725 case Intrinsic::amdgcn_workitem_id_y:
5726 case Intrinsic::amdgcn_workitem_id_z: {
5727 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5728 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5729 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5730 break;
5731 }
5732 default:
5733 break;
5734 }
5735 }
5736 }
5737}
5738
5739 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5740 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5741 unsigned Depth) const {
5742 switch (Op.getOpcode()) {
5743 case AMDGPUISD::BFE_I32: {
5744 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5745 if (!Width)
5746 return 1;
5747
5748 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5749 if (!isNullConstant(Op.getOperand(1)))
5750 return SignBits;
5751
5752 // TODO: Could probably figure something out with non-0 offsets.
5753 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5754 return std::max(SignBits, Op0SignBits);
5755 }
5756
5757 case AMDGPUISD::BFE_U32: {
5758 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5759 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5760 }
5761
5762 case AMDGPUISD::CARRY:
5763 case AMDGPUISD::BORROW:
5764 return 31;
5765 case AMDGPUISD::BUFFER_LOAD_BYTE:
5766 return 25;
5767 case AMDGPUISD::BUFFER_LOAD_SHORT:
5768 return 17;
5769 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5770 return 24;
5771 case AMDGPUISD::BUFFER_LOAD_USHORT:
5772 return 16;
5773 case AMDGPUISD::FP_TO_FP16:
5774 return 16;
5775 case AMDGPUISD::SMIN3:
5776 case AMDGPUISD::SMAX3:
5777 case AMDGPUISD::SMED3:
5778 case AMDGPUISD::UMIN3:
5779 case AMDGPUISD::UMAX3:
5780 case AMDGPUISD::UMED3: {
5781 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5782 if (Tmp2 == 1)
5783 return 1; // Early out.
5784
5785 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5786 if (Tmp1 == 1)
5787 return 1; // Early out.
5788
5789 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5790 if (Tmp0 == 1)
5791 return 1; // Early out.
5792
5793 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5794 }
5795 default:
5796 return 1;
5797 }
5798}
5799
5800 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5801 GISelKnownBits &Analysis, Register R,
5802 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5803 unsigned Depth) const {
5804 const MachineInstr *MI = MRI.getVRegDef(R);
5805 if (!MI)
5806 return 1;
5807
5808 // TODO: Check range metadata on MMO.
5809 switch (MI->getOpcode()) {
5810 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5811 return 25;
5812 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5813 return 17;
5814 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5815 return 24;
5816 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5817 return 16;
5818 case AMDGPU::G_AMDGPU_SMED3:
5819 case AMDGPU::G_AMDGPU_UMED3: {
5820 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5821 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5822 if (Tmp2 == 1)
5823 return 1;
5824 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5825 if (Tmp1 == 1)
5826 return 1;
5827 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5828 if (Tmp0 == 1)
5829 return 1;
5830 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5831 }
5832 default:
5833 return 1;
5834 }
5835}
5836
5837 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5838 const SelectionDAG &DAG,
5839 bool SNaN,
5840 unsigned Depth) const {
5841 unsigned Opcode = Op.getOpcode();
5842 switch (Opcode) {
5843 case AMDGPUISD::FMIN_LEGACY:
5844 case AMDGPUISD::FMAX_LEGACY: {
5845 if (SNaN)
5846 return true;
5847
5848 // TODO: Can check no nans on one of the operands for each one, but which
5849 // one?
5850 return false;
5851 }
5852 case AMDGPUISD::FMUL_LEGACY:
5853 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5854 if (SNaN)
5855 return true;
5856 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5857 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5858 }
5859 case AMDGPUISD::FMED3:
5860 case AMDGPUISD::FMIN3:
5861 case AMDGPUISD::FMAX3:
5862 case AMDGPUISD::FMINIMUM3:
5863 case AMDGPUISD::FMAXIMUM3:
5864 case AMDGPUISD::FMAD_FTZ: {
5865 if (SNaN)
5866 return true;
5867 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5868 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5869 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5870 }
5871 case AMDGPUISD::CVT_F32_UBYTE0:
5872 case AMDGPUISD::CVT_F32_UBYTE1:
5873 case AMDGPUISD::CVT_F32_UBYTE2:
5874 case AMDGPUISD::CVT_F32_UBYTE3:
5875 return true;
5876
5877 case AMDGPUISD::RCP:
5878 case AMDGPUISD::RSQ:
5879 case AMDGPUISD::RCP_LEGACY:
5880 case AMDGPUISD::RSQ_CLAMP: {
5881 if (SNaN)
5882 return true;
5883
5884 // TODO: Need is known positive check.
5885 return false;
5886 }
5887 case ISD::FLDEXP:
5888 case AMDGPUISD::FRACT: {
5889 if (SNaN)
5890 return true;
5891 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5892 }
5893 case AMDGPUISD::DIV_SCALE:
5894 case AMDGPUISD::DIV_FMAS:
5895 case AMDGPUISD::DIV_FIXUP:
5896 // TODO: Refine on operands.
5897 return SNaN;
5898 case AMDGPUISD::SIN_HW:
5899 case AMDGPUISD::COS_HW: {
5900 // TODO: Need check for infinity
5901 return SNaN;
5902 }
5903 case ISD::INTRINSIC_WO_CHAIN: {
5904 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5905 // TODO: Handle more intrinsics
5906 switch (IntrinsicID) {
5907 case Intrinsic::amdgcn_cubeid:
5908 return true;
5909
5910 case Intrinsic::amdgcn_frexp_mant: {
5911 if (SNaN)
5912 return true;
5913 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5914 }
5915 case Intrinsic::amdgcn_cvt_pkrtz: {
5916 if (SNaN)
5917 return true;
5918 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5919 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5920 }
5921 case Intrinsic::amdgcn_rcp:
5922 case Intrinsic::amdgcn_rsq:
5923 case Intrinsic::amdgcn_rcp_legacy:
5924 case Intrinsic::amdgcn_rsq_legacy:
5925 case Intrinsic::amdgcn_rsq_clamp: {
5926 if (SNaN)
5927 return true;
5928
5929 // TODO: Need is known positive check.
5930 return false;
5931 }
5932 case Intrinsic::amdgcn_trig_preop:
5933 case Intrinsic::amdgcn_fdot2:
5934 // TODO: Refine on operand
5935 return SNaN;
5936 case Intrinsic::amdgcn_fma_legacy:
5937 if (SNaN)
5938 return true;
5939 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5940 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5941 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5942 default:
5943 return false;
5944 }
5945 }
5946 default:
5947 return false;
5948 }
5949}
5950
5951 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
5952 Register N0, Register N1) const {
5953 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
5954}
5955
5956 TargetLowering::AtomicExpansionKind
5957 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
5958 switch (RMW->getOperation()) {
5959 case AtomicRMWInst::Nand:
5960 case AtomicRMWInst::FAdd:
5961 case AtomicRMWInst::FSub:
5962 case AtomicRMWInst::FMax:
5963 case AtomicRMWInst::FMin:
5964 return AtomicExpansionKind::CmpXChg;
5965 default: {
5966 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
5967 unsigned Size = IntTy->getBitWidth();
5968 if (Size == 32 || Size == 64)
5969 return AtomicExpansionKind::None;
5970 }
5971
5972 return AtomicExpansionKind::CmpXChg;
5973 }
5974 }
5975}
5976
5977/// Whether it is profitable to sink the operands of an
5978/// Instruction I to the basic block of I.
5979 /// This helps make use of source modifiers (like abs and neg) more often.
5980 bool AMDGPUTargetLowering::shouldSinkOperands(
5981 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5982 using namespace PatternMatch;
5983
5984 for (auto &Op : I->operands()) {
5985 // Ensure we are not already sinking this operand.
5986 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
5987 continue;
5988
5989 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
5990 Ops.push_back(&Op);
5991 }
5992
5993 return !Ops.empty();
5994}
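// For example (illustrative IR only): an fneg whose sole use is in another
// block,
//   bb0: %n = fneg float %x
//   bb1: %r = call float @llvm.fma.f32(float %n, float %y, float %z)
// is reported here as profitable to sink next to its user, where it can later
// fold into a neg source modifier on the VALU instruction.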
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_READONLY
Definition: Compiler.h:227
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:266
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1260
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1042
const fltSemantics & getSemantics() const
Definition: APFloat.h:1303
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1060
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1026
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ FSub
*p = old - v
Definition: Instructions.h:788
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
BinOp getOperation() const
Definition: Instructions.h:845
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for unsupported feature in backend.
iterator_range< arg_iterator > args()
Definition: Function.h:837
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:295
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
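For illustration only (N is assumed to be an i64 value and DL a debug location), the typical call splits a 64-bit scalar into 32-bit halves:
  // Lo receives bits [31:0], Hi receives bits [63:32], both as i32.
  auto [Lo, Hi] = DAG.SplitScalar(N, DL, MVT::i32, MVT::i32);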
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool equals(StringRef RHS) const
equals - Check for string equality; this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
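A hedged example of the calling pattern, with an op/type pairing chosen purely for illustration rather than taken from this file:
  // Ask legalization to call the target's LowerOperation hook for f32 FDIV.
  setOperationAction(ISD::FDIV, MVT::f32, Custom);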
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes=true) const
Returns the type for the shift amount of a shift opcode.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
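As a sketch of how the extending-load and truncating-store hooks are usually configured together (the i64/i16 pairing is illustrative, not drawn from this file):
  // Expand i16 -> i64 extending loads and the matching truncating stores.
  setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);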
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be u...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
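A hedged sketch of the usual fallback in a custom store-lowering hook: if the target rejects the misaligned access, hand the store to the generic expansion (ST is assumed to be a StoreSDNode*):
  // Defer to the generic two-half expansion for unsupported alignments.
  if (!allowsMisalignedMemoryAccesses(ST->getMemoryVT(), ST->getAddressSpace(),
                                      ST->getAlign()))
    return expandUnalignedStore(ST, DAG);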
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
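A minimal sketch, assuming LD is a LoadSDNode* inside a custom load-lowering hook, of returning both the scalarized value and the updated chain:
  auto [Value, NewChain] = scalarizeVectorLoad(LD, DAG);
  return DAG.getMergeValues({Value, NewChain}, SDLoc(LD));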
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
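A hedged sketch of the common combine-time pattern (DCI is a DAGCombinerInfo, Op an operand of the node being combined, and the 24-bit demand is illustrative):
  // Let the simplifier rewrite Op using only its low 24 bits.
  TargetLowering::TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  KnownBits Known;
  if (SimplifyDemandedBits(Op, APInt::getLowBitsSet(32, 24), Known, TLO))
    DCI.CommitTargetLoweringOpt(TLO);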
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:723
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:269
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:979
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1031
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:913
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:722
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1052
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1056
@ FCANONICALIZE
Returns the platform-specific canonical encoding of a floating-point number.
Definition: ISDOpcodes.h:500
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:507
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:222
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on two values,...
Definition: ISDOpcodes.h:978
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1041
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1097
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:984
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1208
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:279
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:944
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1094
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1523
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1503
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
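A small sketch of how these IR-level matchers are typically used when inspecting an incoming Value (V is assumed to be an llvm::Value*):
  using namespace llvm::PatternMatch;
  Value *X = nullptr;
  // True when V computes fneg(fabs(X)).
  bool IsNegAbs = match(V, m_FNeg(m_FAbs(m_Value(X))));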
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1731
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool getAlign(const Function &F, unsigned index, unsigned &align)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:361
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
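For example (the constant is illustrative), splitting a 64-bit immediate into the halves used to build a 32-bit register pair:
  uint64_t Imm = 0x123456789ABCDEF0ULL;
  uint32_t Lo = Lo_32(Imm); // 0x9ABCDEF0
  uint32_t Hi = Hi_32(Imm); // 0x12345678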
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
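A hedged sketch of the usual query in a combine: detect a uniform power-of-two constant (or splat) on one operand (RHS is an SDValue):
  unsigned ShAmt = 0;
  if (ConstantSDNode *C = isConstOrConstSplat(RHS))
    if (C->getAPIntValue().isPowerOf2())
      ShAmt = C->getAPIntValue().logBase2(); // e.g. turn a multiply into a shift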
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1387
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:462
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:404
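As a short illustration of these width helpers (Ctx is an LLVMContext; the 24-bit starting width is arbitrary):
  EVT OddVT = EVT::getIntegerVT(Ctx, 24);
  EVT Round = OddVT.getRoundIntegerType(Ctx);    // i32
  EVT Half  = Round.getHalfSizedIntegerVT(Ctx);  // i16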
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:141
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:110
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:265
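A brief sketch of how these KnownBits queries typically feed a narrowing decision (Known is the result of a computeKnownBits call on the operand of interest; the 24-bit threshold is illustrative):
  bool FitsUnsigned24 = Known.countMaxActiveBits() <= 24;
  bool FitsSigned24   = Known.countMaxSignificantBits() <= 24;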
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...