LLVM 19.0.0git
AMDGPUISelLowering.cpp
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
40static EVT getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47}
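// Illustrative note (not part of the original file): a v4i8 store (32-bit
// store size) maps to i32, while a v8i16 store (128 bits) maps to v4i32; the
// assert above covers the remaining multiple-of-32 cases.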
48
49unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
50 return DAG.computeKnownBits(Op).countMaxActiveBits();
51}
52
53unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
54 // In order for this to be a signed 24-bit value, bit 23 must
55 // be a sign bit.
56 return DAG.ComputeMaxSignificantBits(Op);
57}
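// Illustrative note (assumption, not part of the original file): callers treat
// a value as a signed 24-bit operand when numBitsSigned(Op, DAG) <= 24, i.e.
// when every bit above bit 23 is a copy of the sign bit, mirroring the comment
// above.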
58
59AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
60 const AMDGPUSubtarget &STI)
61 : TargetLowering(TM), Subtarget(&STI) {
62 // Always lower memset, memcpy, and memmove intrinsics to load/store
63 // instructions, rather than generating calls to memset, memcpy, or memmove.
67
68 // Lower floating point store/load to integer store/load to reduce the number
69 // of patterns in tablegen.
70 setOperationAction(ISD::LOAD, MVT::f32, Promote);
71 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
72
73 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
74 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
75
76 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
77 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
78
79 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
80 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
81
82 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
83 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
84
85 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
86 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
87
88 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
89 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
90
91 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
92 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
93
94 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
95 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
96
97 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
98 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
99
100 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
102
103 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
105
106 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
107 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
108
109 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
110 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
111
112 setOperationAction(ISD::LOAD, MVT::i64, Promote);
113 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
114
115 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
116 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
117
118 setOperationAction(ISD::LOAD, MVT::f64, Promote);
119 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
120
121 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
122 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
123
124 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
125 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
126
127 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
128 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
129
130 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
131 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
132
133 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
134 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
135
136 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
137 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
138
139 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
140 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
141
142 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
143 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
144
145 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
146 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
147
148 setOperationAction(ISD::LOAD, MVT::i128, Promote);
149 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
150
151 // There are no 64-bit extloads. These should be done as a 32-bit extload and
152 // an extension to 64-bit.
153 for (MVT VT : MVT::integer_valuetypes())
154 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, MVT::i64, VT,
155 Expand);
156
157 for (MVT VT : MVT::integer_valuetypes()) {
158 if (VT == MVT::i64)
159 continue;
160
161 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
162 setLoadExtAction(Op, VT, MVT::i1, Promote);
163 setLoadExtAction(Op, VT, MVT::i8, Legal);
164 setLoadExtAction(Op, VT, MVT::i16, Legal);
165 setLoadExtAction(Op, VT, MVT::i32, Expand);
166 }
167 }
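// Illustrative note (not part of the original file): the loop above makes i8
// and i16 extending loads into wider integer types directly selectable (the
// hardware byte/short loads already extend), promotes extloads from i1, and
// expands extending loads whose memory type is i32 into a plain 32-bit load
// followed by a separate extension.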
168
170 for (auto MemVT :
171 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
173 Expand);
174
175 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
177 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
178 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
183 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
184 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
185 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
188 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
189
190 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
191 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
192 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
193 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
195 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
196
197 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
198 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
199 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
200 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
201 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
202 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
203 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
204 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
205 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
209
210 setOperationAction(ISD::STORE, MVT::f32, Promote);
211 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
212
213 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
214 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
215
216 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
217 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
218
219 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
220 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
221
222 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
223 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
224
225 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
226 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
227
228 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
229 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
230
231 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
232 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
233
234 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
235 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
236
237 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
238 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
239
240 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
241 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
242
243 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
244 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
245
246 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
247 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
248
249 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
250 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
251
252 setOperationAction(ISD::STORE, MVT::i64, Promote);
253 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
254
255 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
256 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
257
258 setOperationAction(ISD::STORE, MVT::f64, Promote);
259 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
260
261 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
263
264 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
265 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
266
267 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
268 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
269
270 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
271 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
272
273 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
274 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
275
276 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
277 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
278
279 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
280 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
281
282 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
283 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
284
285 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
286 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
287
288 setOperationAction(ISD::STORE, MVT::i128, Promote);
289 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
290
291 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
292 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
293 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
294 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
295
296 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
297 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
298 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
299 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
300
301 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
302 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
303 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
304 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
305 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
306 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
307 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
308 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
309
310 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
311 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
312 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
313
314 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
315 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
316
317 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
318
319 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
320 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
321 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
322 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
323 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
324 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
325
326 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
327 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
328 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
329 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
330
331 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
332 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
333
334 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
335 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
336 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
337 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
338 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
339 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
340 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
341
342 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
343 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
344
346
347 // For R600, this is totally unsupported, just custom lower to produce an
348 // error.
349 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
350
351 // Library functions. These default to Expand, but we have instructions
352 // for them.
355 MVT::f32, Legal);
356
358 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
359
362 Custom);
363
364 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
365
366 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
367
368 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
369
370 if (Subtarget->has16BitInsts())
371 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
372 else {
373 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
375 }
376
378 Custom);
379
380 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
381 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
382 // default unless marked custom/legal.
383 setOperationAction(
384 ISD::IS_FPCLASS,
385 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
386 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
387 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
388 Custom);
389
390 // Expand to fneg + fadd.
391 setOperationAction(ISD::FSUB, MVT::f64, Expand);
392
393 setOperationAction(ISD::CONCAT_VECTORS,
394 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
395 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
396 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
397 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
398 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
399 Custom);
400
401 // FIXME: Why is v8f16/v8bf16 missing?
402 setOperationAction(
403 ISD::EXTRACT_SUBVECTOR,
404 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
405 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
406 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
407 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
408 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
409 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
410 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
411 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
412 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
413 MVT::v32i16, MVT::v32f16, MVT::v32bf16},
414 Custom);
415
417 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
418
419 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
420 for (MVT VT : ScalarIntVTs) {
421 // These should use [SU]DIVREM, so set them to expand
422 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
423 Expand);
424
425 // GPU does not have divrem function for signed or unsigned.
426 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
427
428 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
429 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
430
432
433 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
434 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
435 }
436
437 // The hardware supports 32-bit FSHR, but not FSHL.
438 setOperationAction(ISD::FSHR, MVT::i32, Legal);
439
440 // The hardware supports 32-bit ROTR, but not ROTL.
441 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
443
445
449 MVT::i64, Custom);
451
453 Legal);
454
457 MVT::i64, Custom);
458
459 for (auto VT : {MVT::i8, MVT::i16})
461
462 static const MVT::SimpleValueType VectorIntTypes[] = {
463 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
464 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
465
466 for (MVT VT : VectorIntTypes) {
467 // Expand the following operations for the current type by default.
479 ISD::SETCC},
480 VT, Expand);
481 }
482
483 static const MVT::SimpleValueType FloatVectorTypes[] = {
484 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
485 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
486
487 for (MVT VT : FloatVectorTypes) {
500 VT, Expand);
501 }
502
503 // This causes using an unrolled select operation rather than expansion with
504 // bit operations. This is in general better, but the alternative using BFI
505 // instructions may be better if the select sources are SGPRs.
506 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
507 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
508
509 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
510 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
511
512 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
513 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
514
515 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
516 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
517
518 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
519 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
520
521 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
522 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
523
524 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
525 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
526
527 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
528 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
529
530 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
531 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
532
533 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
534 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
535
536 // Disable most libcalls.
537 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
538 if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
539 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
540 }
541
543 setJumpIsExpensive(true);
544
545 // FIXME: This is only partially true. If we have to do vector compares, any
546 // SGPR pair can be a condition register. If we have a uniform condition, we
547 // are better off doing SALU operations, where there is only one SCC. For now,
548 // we don't have a way of knowing during instruction selection if a condition
549 // will be uniform and we always use vector compares. Assume we are using
550 // vector compares until that is fixed.
551 setHasMultipleConditionRegisters(true);
552
555
557
558 // We want to find all load dependencies for long chains of stores to enable
559 // merging into very wide vectors. The problem is with vectors with > 4
560 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
561 // vectors are a legal type, even though we have to split the loads
562 // usually. When we can more precisely specify load legality per address
563 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
564 // smarter so that they can figure out what to do in 2 iterations without all
565 // N > 4 stores on the same chain.
566 GatherAllAliasesMaxDepth = 16;
567
568 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
569 // about these during lowering.
570 MaxStoresPerMemcpy = 0xffffffff;
571 MaxStoresPerMemmove = 0xffffffff;
572 MaxStoresPerMemset = 0xffffffff;
573
574 // The expansion for 64-bit division is enormous.
575 if (AMDGPUBypassSlowDiv)
576 addBypassSlowDiv(64, 32);
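// Illustrative note (assumption, not part of the original file): with the
// bypass registered above, the generic slow-division bypass transform emits a
// runtime check along the lines of
//   if (((a | b) >> 32) == 0) { /* operands are dynamically 32-bit */
//     do a 32-bit udiv/urem and zero-extend the results;
//   } else {
//     fall back to the full 64-bit expansion;
//   }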
577
588
592}
593
594bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
595 if (getTargetMachine().Options.NoSignedZerosFPMath)
596 return true;
597
598 const auto Flags = Op.getNode()->getFlags();
599 if (Flags.hasNoSignedZeros())
600 return true;
601
602 return false;
603}
604
605//===----------------------------------------------------------------------===//
606// Target Information
607//===----------------------------------------------------------------------===//
608
610static bool fnegFoldsIntoOpcode(unsigned Opc) {
611 switch (Opc) {
612 case ISD::FADD:
613 case ISD::FSUB:
614 case ISD::FMUL:
615 case ISD::FMA:
616 case ISD::FMAD:
617 case ISD::FMINNUM:
618 case ISD::FMAXNUM:
621 case ISD::FMINIMUM:
622 case ISD::FMAXIMUM:
623 case ISD::SELECT:
624 case ISD::FSIN:
625 case ISD::FTRUNC:
626 case ISD::FRINT:
627 case ISD::FNEARBYINT:
628 case ISD::FROUNDEVEN:
630 case AMDGPUISD::RCP:
637 case AMDGPUISD::FMED3:
638 // TODO: handle llvm.amdgcn.fma.legacy
639 return true;
640 case ISD::BITCAST:
641 llvm_unreachable("bitcast is special cased");
642 default:
643 return false;
644 }
645}
646
647static bool fnegFoldsIntoOp(const SDNode *N) {
648 unsigned Opc = N->getOpcode();
649 if (Opc == ISD::BITCAST) {
650 // TODO: Is there a benefit to checking the conditions performFNegCombine
651 // does? We don't for the other cases.
652 SDValue BCSrc = N->getOperand(0);
653 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
654 return BCSrc.getNumOperands() == 2 &&
655 BCSrc.getOperand(1).getValueSizeInBits() == 32;
656 }
657
658 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
659 }
660
661 return fnegFoldsIntoOpcode(Opc);
662}
663
664/// \returns true if the operation will definitely need to use a 64-bit
665/// encoding, and thus will use a VOP3 encoding regardless of the source
666/// modifiers.
668static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
669 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
670 VT == MVT::f64;
671}
672
673/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
674/// the type used by ISD::SELECT.
676static bool selectSupportsSourceMods(const SDNode *N) {
677 // TODO: Only applies if select will be vector
678 return N->getValueType(0) == MVT::f32;
679}
680
681// Most FP instructions support source modifiers, but this could be refined
682// slightly.
684static bool hasSourceMods(const SDNode *N) {
685 if (isa<MemSDNode>(N))
686 return false;
687
688 switch (N->getOpcode()) {
689 case ISD::CopyToReg:
690 case ISD::FDIV:
691 case ISD::FREM:
692 case ISD::INLINEASM:
696
697 // TODO: Should really be looking at the users of the bitcast. These are
698 // problematic because bitcasts are used to legalize all stores to integer
699 // types.
700 case ISD::BITCAST:
701 return false;
702 case ISD::INTRINSIC_WO_CHAIN: {
703 switch (N->getConstantOperandVal(0)) {
704 case Intrinsic::amdgcn_interp_p1:
705 case Intrinsic::amdgcn_interp_p2:
706 case Intrinsic::amdgcn_interp_mov:
707 case Intrinsic::amdgcn_interp_p1_f16:
708 case Intrinsic::amdgcn_interp_p2_f16:
709 return false;
710 default:
711 return true;
712 }
713 }
714 case ISD::SELECT:
715 return selectSupportsSourceMods(N);
716 default:
717 return true;
718 }
719}
720
721bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
722 unsigned CostThreshold) {
723 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
724 // it is truly free to use a source modifier in all cases. If there are
725 // multiple users but for each one will necessitate using VOP3, there will be
726 // a code size increase. Try to avoid increasing code size unless we know it
727 // will save on the instruction count.
728 unsigned NumMayIncreaseSize = 0;
729 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
730
731 assert(!N->use_empty());
732
733 // XXX - Should this limit number of uses to check?
734 for (const SDNode *U : N->uses()) {
735 if (!hasSourceMods(U))
736 return false;
737
738 if (!opMustUseVOP3Encoding(U, VT)) {
739 if (++NumMayIncreaseSize > CostThreshold)
740 return false;
741 }
742 }
743
744 return true;
745}
746
747EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
748 ISD::NodeType ExtendKind) const {
749 assert(!VT.isVector() && "only scalar expected");
750
751 // Round to the next multiple of 32-bits.
752 unsigned Size = VT.getSizeInBits();
753 if (Size <= 32)
754 return MVT::i32;
755 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
756}
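// Illustrative note (not part of the original file): a 40- or 48-bit scalar
// return value rounds up to 32 * ((Size + 31) / 32) = 64 bits and is returned
// as i64, while anything of 32 bits or less is returned as i32.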
757
759 return MVT::i32;
760}
761
763 return true;
764}
765
766// The backend supports 32 and 64 bit floating point immediates.
767// FIXME: Why are we reporting vectors of FP immediates as legal?
768bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
769 bool ForCodeSize) const {
770 EVT ScalarVT = VT.getScalarType();
771 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
772 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
773}
774
775// We don't want to shrink f64 / f32 constants.
776bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
777 EVT ScalarVT = VT.getScalarType();
778 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
779}
780
781bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
782 ISD::LoadExtType ExtTy,
783 EVT NewVT) const {
784 // TODO: This may be worth removing. Check regression tests for diffs.
785 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
786 return false;
787
788 unsigned NewSize = NewVT.getStoreSizeInBits();
789
790 // If we are reducing to a 32-bit load or a smaller multi-dword load,
791 // this is always better.
792 if (NewSize >= 32)
793 return true;
794
795 EVT OldVT = N->getValueType(0);
796 unsigned OldSize = OldVT.getStoreSizeInBits();
797
798 MemSDNode *MN = cast<MemSDNode>(N);
799 unsigned AS = MN->getAddressSpace();
800 // Do not shrink an aligned scalar load to sub-dword.
801 // Scalar engine cannot do sub-dword loads.
802 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
803 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
806 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
807 MN->isInvariant())) &&
809 return false;
810
811 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
812 // extloads, so doing one requires using a buffer_load. In cases where we
813 // still couldn't use a scalar load, using the wider load shouldn't really
814 // hurt anything.
815
816 // If the old size already had to be an extload, there's no harm in continuing
817 // to reduce the width.
818 return (OldSize < 32);
819}
820
821bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
822 const SelectionDAG &DAG,
823 const MachineMemOperand &MMO) const {
824
825 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
826
827 if (LoadTy.getScalarType() == MVT::i32)
828 return false;
829
830 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
831 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
832
833 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
834 return false;
835
836 unsigned Fast = 0;
837 return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
838 CastTy, MMO, &Fast) &&
839 Fast;
840}
841
842// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
843// profitable with the expansion for 64-bit since it's generally good to
844// speculate things.
845bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
846 return true;
847}
848
849bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
850 return true;
851}
852
853bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
854 switch (N->getOpcode()) {
855 case ISD::EntryToken:
856 case ISD::TokenFactor:
857 return true;
858 case ISD::INTRINSIC_WO_CHAIN: {
859 unsigned IntrID = N->getConstantOperandVal(0);
860 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
861 }
862 case ISD::LOAD:
863 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
864 AMDGPUAS::CONSTANT_ADDRESS)
865 return true;
866 return false;
867 case AMDGPUISD::SETCC: // ballot-style instruction
868 return true;
869 }
870 return false;
871}
872
873SDValue AMDGPUTargetLowering::getNegatedExpression(
874 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
875 NegatibleCost &Cost, unsigned Depth) const {
876
877 switch (Op.getOpcode()) {
878 case ISD::FMA:
879 case ISD::FMAD: {
880 // Negating a fma is not free if it has users without source mods.
881 if (!allUsesHaveSourceMods(Op.getNode()))
882 return SDValue();
883 break;
884 }
885 case AMDGPUISD::RCP: {
886 SDValue Src = Op.getOperand(0);
887 EVT VT = Op.getValueType();
888 SDLoc SL(Op);
889
890 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
891 ForCodeSize, Cost, Depth + 1);
892 if (NegSrc)
893 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
894 return SDValue();
895 }
896 default:
897 break;
898 }
899
900 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
901 ForCodeSize, Cost, Depth);
902}
903
904//===---------------------------------------------------------------------===//
905// Target Properties
906//===---------------------------------------------------------------------===//
907
908bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
909 assert(VT.isFloatingPoint());
910
911 // Packed operations do not have a fabs modifier.
912 return VT == MVT::f32 || VT == MVT::f64 ||
913 (Subtarget->has16BitInsts() && VT == MVT::f16);
914}
915
916bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
917 assert(VT.isFloatingPoint());
918 // Report this based on the end legalized type.
919 VT = VT.getScalarType();
920 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
921}
922
923bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
924 unsigned NumElem,
925 unsigned AS) const {
926 return true;
927}
928
929bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
930 // There are few operations which truly have vector input operands. Any vector
931 // operation is going to involve operations on each component, and a
932 // build_vector will be a copy per element, so it always makes sense to use a
933 // build_vector input in place of the extracted element to avoid a copy into a
934 // super register.
935 //
936 // We should probably only do this if all users are extracts only, but this
937 // should be the common case.
938 return true;
939}
940
941bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
942 // Truncate is just accessing a subregister.
943
944 unsigned SrcSize = Source.getSizeInBits();
945 unsigned DestSize = Dest.getSizeInBits();
946
947 return DestSize < SrcSize && DestSize % 32 == 0;
948}
949
950bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
951 // Truncate is just accessing a subregister.
952
953 unsigned SrcSize = Source->getScalarSizeInBits();
954 unsigned DestSize = Dest->getScalarSizeInBits();
955
956 if (DestSize == 16 && Subtarget->has16BitInsts())
957 return SrcSize >= 32;
958
959 return DestSize < SrcSize && DestSize % 32 == 0;
960}
961
962bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
963 unsigned SrcSize = Src->getScalarSizeInBits();
964 unsigned DestSize = Dest->getScalarSizeInBits();
965
966 if (SrcSize == 16 && Subtarget->has16BitInsts())
967 return DestSize >= 32;
968
969 return SrcSize == 32 && DestSize == 64;
970}
971
972bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
973 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
974 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
975 // this will enable reducing 64-bit operations to 32-bit, which is always
976 // good.
977
978 if (Src == MVT::i16)
979 return Dest == MVT::i32 || Dest == MVT::i64;
980
981 return Src == MVT::i32 && Dest == MVT::i64;
982}
983
984bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
985 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
986 // limited number of native 64-bit operations. Shrinking an operation to fit
987 // in a single 32-bit register should always be helpful. As currently used,
988 // this is much less general than the name suggests, and is only used in
989 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
990 // not profitable, and may actually be harmful.
991 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
992}
993
994bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
995 const SDNode* N, CombineLevel Level) const {
996 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
997 N->getOpcode() == ISD::SRL) &&
998 "Expected shift op");
999 // Always commute pre-type legalization and right shifts.
1000 // We're looking for shl(or(x,y),z) patterns.
1001 if (Level < CombineLevel::AfterLegalizeTypes ||
1002 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1003 return true;
1004
1005 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1006 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1007 (N->use_begin()->getOpcode() == ISD::SRA ||
1008 N->use_begin()->getOpcode() == ISD::SRL))
1009 return false;
1010
1011 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1012 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1013 if (LHS.getOpcode() != ISD::SHL)
1014 return false;
1015 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1016 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1017 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1018 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1019 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1020 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1021 };
1022 SDValue LHS = N->getOperand(0).getOperand(0);
1023 SDValue RHS = N->getOperand(0).getOperand(1);
1024 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1025}
1026
1027//===---------------------------------------------------------------------===//
1028// TargetLowering Callbacks
1029//===---------------------------------------------------------------------===//
1030
1031CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
1032 bool IsVarArg) {
1033 switch (CC) {
1041 return CC_AMDGPU;
1044 return CC_AMDGPU_CS_CHAIN;
1045 case CallingConv::C:
1046 case CallingConv::Fast:
1047 case CallingConv::Cold:
1048 return CC_AMDGPU_Func;
1050 return CC_SI_Gfx;
1053 default:
1054 report_fatal_error("Unsupported calling convention for call");
1055 }
1056}
1057
1058CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1059 bool IsVarArg) {
1060 switch (CC) {
1063 llvm_unreachable("kernels should not be handled here");
1073 return RetCC_SI_Shader;
1075 return RetCC_SI_Gfx;
1076 case CallingConv::C:
1077 case CallingConv::Fast:
1078 case CallingConv::Cold:
1079 return RetCC_AMDGPU_Func;
1080 default:
1081 report_fatal_error("Unsupported calling convention.");
1082 }
1083}
1084
1085/// The SelectionDAGBuilder will automatically promote function arguments
1086/// with illegal types. However, this does not work for the AMDGPU targets
1087/// since the function arguments are stored in memory as these illegal types.
1088/// In order to handle this properly we need to get the original types sizes
1089/// from the LLVM IR Function and fix up the ISD::InputArg values before
1090/// passing them to AnalyzeFormalArguments()
1091
1092/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1093/// input values across multiple registers. Each item in the Ins array
1094/// represents a single value that will be stored in registers. Ins[x].VT is
1095/// the value type of the value that will be stored in the register, so
1096/// whatever SDNode we lower the argument to needs to be this type.
1097///
1098/// In order to correctly lower the arguments we need to know the size of each
1099/// argument. Since Ins[x].VT gives us the size of the register that will
1100/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1101/// for the original function argument so that we can deduce the correct memory
1102/// type to use for Ins[x]. In most cases the correct memory type will be
1103/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1104/// we have a kernel argument of type v8i8, this argument will be split into
1105/// 8 parts and each part will be represented by its own item in the Ins array.
1106/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1107/// the argument before it was split. From this, we deduce that the memory type
1108/// for each individual part is i8. We pass the memory type as LocVT to the
1109/// calling convention analysis function and the register type (Ins[x].VT) as
1110/// the ValVT.
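// Illustrative example (restating the comment above, not part of the original
// file): for a v8i8 kernel argument that is split into 8 parts, each part has
// Ins[x].ArgVT == v8i8 (the pre-split IR type); the analysis below derives the
// per-part memory type i8 and passes it as the LocVT, while Ins[x].VT (the
// register type) is passed as the ValVT.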
1111void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1112 CCState &State,
1113 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1114 const MachineFunction &MF = State.getMachineFunction();
1115 const Function &Fn = MF.getFunction();
1116 LLVMContext &Ctx = Fn.getParent()->getContext();
1117 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1118 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1119 CallingConv::ID CC = Fn.getCallingConv();
1120
1121 Align MaxAlign = Align(1);
1122 uint64_t ExplicitArgOffset = 0;
1123 const DataLayout &DL = Fn.getParent()->getDataLayout();
1124
1125 unsigned InIndex = 0;
1126
1127 for (const Argument &Arg : Fn.args()) {
1128 const bool IsByRef = Arg.hasByRefAttr();
1129 Type *BaseArgTy = Arg.getType();
1130 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1131 Align Alignment = DL.getValueOrABITypeAlignment(
1132 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1133 MaxAlign = std::max(Alignment, MaxAlign);
1134 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1135
1136 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1137 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1138
1139 // We're basically throwing away everything passed into us and starting over
1140 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1141 // to us as computed in Ins.
1142 //
1143 // We also need to figure out what type legalization is trying to do to get
1144 // the correct memory offsets.
1145
1146 SmallVector<EVT, 16> ValueVTs;
1147 SmallVector<uint64_t, 16> Offsets;
1148 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1149
1150 for (unsigned Value = 0, NumValues = ValueVTs.size();
1151 Value != NumValues; ++Value) {
1152 uint64_t BasePartOffset = Offsets[Value];
1153
1154 EVT ArgVT = ValueVTs[Value];
1155 EVT MemVT = ArgVT;
1156 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1157 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1158
1159 if (NumRegs == 1) {
1160 // This argument is not split, so the IR type is the memory type.
1161 if (ArgVT.isExtended()) {
1162 // We have an extended type, like i24, so we should just use the
1163 // register type.
1164 MemVT = RegisterVT;
1165 } else {
1166 MemVT = ArgVT;
1167 }
1168 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1169 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1170 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1171 // We have a vector value which has been split into a vector with
1172 // the same scalar type, but fewer elements. This should handle
1173 // all the floating-point vector types.
1174 MemVT = RegisterVT;
1175 } else if (ArgVT.isVector() &&
1176 ArgVT.getVectorNumElements() == NumRegs) {
1177 // This arg has been split so that each element is stored in a separate
1178 // register.
1179 MemVT = ArgVT.getScalarType();
1180 } else if (ArgVT.isExtended()) {
1181 // We have an extended type, like i65.
1182 MemVT = RegisterVT;
1183 } else {
1184 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1185 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1186 if (RegisterVT.isInteger()) {
1187 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1188 } else if (RegisterVT.isVector()) {
1189 assert(!RegisterVT.getScalarType().isFloatingPoint());
1190 unsigned NumElements = RegisterVT.getVectorNumElements();
1191 assert(MemoryBits % NumElements == 0);
1192 // This vector type has been split into another vector type with
1193 // a different elements size.
1194 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1195 MemoryBits / NumElements);
1196 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1197 } else {
1198 llvm_unreachable("cannot deduce memory type.");
1199 }
1200 }
1201
1202 // Convert one element vectors to scalar.
1203 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1204 MemVT = MemVT.getScalarType();
1205
1206 // Round up vec3/vec5 argument.
1207 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1208 assert(MemVT.getVectorNumElements() == 3 ||
1209 MemVT.getVectorNumElements() == 5 ||
1210 (MemVT.getVectorNumElements() >= 9 &&
1211 MemVT.getVectorNumElements() <= 12));
1212 MemVT = MemVT.getPow2VectorType(State.getContext());
1213 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1214 MemVT = MemVT.getRoundIntegerType(State.getContext());
1215 }
1216
1217 unsigned PartOffset = 0;
1218 for (unsigned i = 0; i != NumRegs; ++i) {
1219 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1220 BasePartOffset + PartOffset,
1221 MemVT.getSimpleVT(),
1222 CCValAssign::Full));
1223 PartOffset += MemVT.getStoreSize();
1224 }
1225 }
1226 }
1227}
1228
1229SDValue AMDGPUTargetLowering::LowerReturn(
1230 SDValue Chain, CallingConv::ID CallConv,
1231 bool isVarArg,
1232 const SmallVectorImpl<ISD::OutputArg> &Outs,
1233 const SmallVectorImpl<SDValue> &OutVals,
1234 const SDLoc &DL, SelectionDAG &DAG) const {
1235 // FIXME: Fails for r600 tests
1236 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1237 // "wave terminate should not have return values");
1238 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1239}
1240
1241//===---------------------------------------------------------------------===//
1242// Target specific lowering
1243//===---------------------------------------------------------------------===//
1244
1245/// Selects the correct CCAssignFn for a given CallingConvention value.
1246CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1247 bool IsVarArg) {
1248 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1249}
1250
1251CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1252 bool IsVarArg) {
1253 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1254}
1255
1256SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1257 SelectionDAG &DAG,
1258 MachineFrameInfo &MFI,
1259 int ClobberedFI) const {
1260 SmallVector<SDValue, 8> ArgChains;
1261 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1262 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1263
1264 // Include the original chain at the beginning of the list. When this is
1265 // used by target LowerCall hooks, this helps legalize find the
1266 // CALLSEQ_BEGIN node.
1267 ArgChains.push_back(Chain);
1268
1269 // Add a chain value for each stack argument corresponding
1270 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1271 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1272 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1273 if (FI->getIndex() < 0) {
1274 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1275 int64_t InLastByte = InFirstByte;
1276 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1277
1278 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1279 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1280 ArgChains.push_back(SDValue(L, 1));
1281 }
1282 }
1283 }
1284 }
1285
1286 // Build a tokenfactor for all the chains.
1287 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1288}
1289
1290SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1291 SmallVectorImpl<SDValue> &InVals,
1292 StringRef Reason) const {
1293 SDValue Callee = CLI.Callee;
1294 SelectionDAG &DAG = CLI.DAG;
1295
1296 const Function &Fn = DAG.getMachineFunction().getFunction();
1297
1298 StringRef FuncName("<unknown>");
1299
1300 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1301 FuncName = G->getSymbol();
1302 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1303 FuncName = G->getGlobal()->getName();
1304
1305 DiagnosticInfoUnsupported NoCalls(
1306 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1307 DAG.getContext()->diagnose(NoCalls);
1308
1309 if (!CLI.IsTailCall) {
1310 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1311 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1312 }
1313
1314 return DAG.getEntryNode();
1315}
1316
1317SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1318 SmallVectorImpl<SDValue> &InVals) const {
1319 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1320}
1321
1322SDValue AMDGPUTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
1323 SelectionDAG &DAG) const {
1324 const Function &Fn = DAG.getMachineFunction().getFunction();
1325
1326 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1327 SDLoc(Op).getDebugLoc());
1328 DAG.getContext()->diagnose(NoDynamicAlloca);
1329 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1330 return DAG.getMergeValues(Ops, SDLoc());
1331}
1332
1333SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1334 SelectionDAG &DAG) const {
1335 switch (Op.getOpcode()) {
1336 default:
1337 Op->print(errs(), &DAG);
1338 llvm_unreachable("Custom lowering code for this "
1339 "instruction is not implemented yet!");
1340 break;
1341 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1342 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1343 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1344 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1345 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1346 case ISD::FREM: return LowerFREM(Op, DAG);
1347 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1348 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1349 case ISD::FRINT: return LowerFRINT(Op, DAG);
1350 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1351 case ISD::FROUNDEVEN:
1352 return LowerFROUNDEVEN(Op, DAG);
1353 case ISD::FROUND: return LowerFROUND(Op, DAG);
1354 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1355 case ISD::FLOG2:
1356 return LowerFLOG2(Op, DAG);
1357 case ISD::FLOG:
1358 case ISD::FLOG10:
1359 return LowerFLOGCommon(Op, DAG);
1360 case ISD::FEXP:
1361 case ISD::FEXP10:
1362 return lowerFEXP(Op, DAG);
1363 case ISD::FEXP2:
1364 return lowerFEXP2(Op, DAG);
1365 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1366 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1367 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1368 case ISD::FP_TO_SINT:
1369 case ISD::FP_TO_UINT:
1370 return LowerFP_TO_INT(Op, DAG);
1371 case ISD::CTTZ:
1372 case ISD::CTTZ_ZERO_UNDEF:
1373 case ISD::CTLZ:
1374 case ISD::CTLZ_ZERO_UNDEF:
1375 return LowerCTLZ_CTTZ(Op, DAG);
1377 }
1378 return Op;
1379}
1380
1381void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1382 SmallVectorImpl<SDValue> &Results,
1383 SelectionDAG &DAG) const {
1384 switch (N->getOpcode()) {
1385 case ISD::SIGN_EXTEND_INREG:
1386 // Different parts of legalization seem to interpret which type of
1387 // sign_extend_inreg is the one to check for custom lowering. The extended
1388 // from type is what really matters, but some places check for custom
1389 // lowering of the result type. This results in trying to use
1390 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1391 // nothing here and let the illegal result integer be handled normally.
1392 return;
1393 case ISD::FLOG2:
1394 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1395 Results.push_back(Lowered);
1396 return;
1397 case ISD::FLOG:
1398 case ISD::FLOG10:
1399 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1400 Results.push_back(Lowered);
1401 return;
1402 case ISD::FEXP2:
1403 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1404 Results.push_back(Lowered);
1405 return;
1406 case ISD::FEXP:
1407 case ISD::FEXP10:
1408 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1409 Results.push_back(Lowered);
1410 return;
1411 case ISD::CTLZ:
1412 case ISD::CTLZ_ZERO_UNDEF:
1413 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1414 Results.push_back(Lowered);
1415 return;
1416 default:
1417 return;
1418 }
1419}
1420
1421SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1422 SDValue Op,
1423 SelectionDAG &DAG) const {
1424
1425 const DataLayout &DL = DAG.getDataLayout();
1426 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1427 const GlobalValue *GV = G->getGlobal();
1428
1429 if (!MFI->isModuleEntryFunction()) {
1430 if (std::optional<uint32_t> Address =
1431 AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
1432 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1433 }
1434 }
1435
1436 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1437 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1438 if (!MFI->isModuleEntryFunction() &&
1439 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1440 SDLoc DL(Op);
1441 const Function &Fn = DAG.getMachineFunction().getFunction();
1442 DiagnosticInfoUnsupported BadLDSDecl(
1443 Fn, "local memory global used by non-kernel function",
1444 DL.getDebugLoc(), DS_Warning);
1445 DAG.getContext()->diagnose(BadLDSDecl);
1446
1447 // We currently don't have a way to correctly allocate LDS objects that
1448 // aren't directly associated with a kernel. We do force inlining of
1449 // functions that use local objects. However, if these dead functions are
1450 // not eliminated, we don't want a compile time error. Just emit a warning
1451 // and a trap, since there should be no callable path here.
1452 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1453 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1454 Trap, DAG.getRoot());
1455 DAG.setRoot(OutputChain);
1456 return DAG.getUNDEF(Op.getValueType());
1457 }
1458
1459 // XXX: What does the value of G->getOffset() mean?
1460 assert(G->getOffset() == 0 &&
1461 "Do not know what to do with an non-zero offset");
1462
1463 // TODO: We could emit code to handle the initialization somewhere.
1464 // We ignore the initializer for now and legalize it to allow selection.
1465 // The initializer will anyway get errored out during assembly emission.
1466 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1467 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1468 }
1469 return SDValue();
1470}
1471
1472SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1473 SelectionDAG &DAG) const {
1474 SmallVector<SDValue, 8> Args;
1475 SDLoc SL(Op);
1476
1477 EVT VT = Op.getValueType();
1478 if (VT.getVectorElementType().getSizeInBits() < 32) {
1479 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1480 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1481 unsigned NewNumElt = OpBitSize / 32;
1482 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1483 : EVT::getVectorVT(*DAG.getContext(),
1484 MVT::i32, NewNumElt);
1485 for (const SDUse &U : Op->ops()) {
1486 SDValue In = U.get();
1487 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1488 if (NewNumElt > 1)
1489 DAG.ExtractVectorElements(NewIn, Args);
1490 else
1491 Args.push_back(NewIn);
1492 }
1493
1494 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1495 NewNumElt * Op.getNumOperands());
1496 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1497 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1498 }
1499 }
1500
1501 for (const SDUse &U : Op->ops())
1502 DAG.ExtractVectorElements(U.get(), Args);
1503
1504 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1505}
1506
1507SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1508 SelectionDAG &DAG) const {
1509 SDLoc SL(Op);
1510 SmallVector<SDValue, 8> Args;
1511 unsigned Start = Op.getConstantOperandVal(1);
1512 EVT VT = Op.getValueType();
1513 EVT SrcVT = Op.getOperand(0).getValueType();
1514
1515 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1516 unsigned NumElt = VT.getVectorNumElements();
1517 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1518 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1519
1520 // Extract 32-bit registers at a time.
1521 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1522 EVT NewVT = NumElt == 2
1523 ? MVT::i32
1524 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1525 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1526
1527 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1528 if (NumElt == 2)
1529 Tmp = Args[0];
1530 else
1531 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1532
1533 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1534 }
1535
1536 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1537 VT.getVectorNumElements());
1538
1539 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1540}
1541
1542// TODO: Handle fabs too
1543static SDValue peekFNeg(SDValue Val) {
1544 if (Val.getOpcode() == ISD::FNEG)
1545 return Val.getOperand(0);
1546
1547 return Val;
1548}
1549
1550static SDValue peekFPSignOps(SDValue Val) {
1551 if (Val.getOpcode() == ISD::FNEG)
1552 Val = Val.getOperand(0);
1553 if (Val.getOpcode() == ISD::FABS)
1554 Val = Val.getOperand(0);
1555 if (Val.getOpcode() == ISD::FCOPYSIGN)
1556 Val = Val.getOperand(0);
1557 return Val;
1558}
1559
1560SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1561 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1562 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1563 SelectionDAG &DAG = DCI.DAG;
1564 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1565 switch (CCOpcode) {
1566 case ISD::SETOEQ:
1567 case ISD::SETONE:
1568 case ISD::SETUNE:
1569 case ISD::SETNE:
1570 case ISD::SETUEQ:
1571 case ISD::SETEQ:
1572 case ISD::SETFALSE:
1573 case ISD::SETFALSE2:
1574 case ISD::SETTRUE:
1575 case ISD::SETTRUE2:
1576 case ISD::SETUO:
1577 case ISD::SETO:
1578 break;
1579 case ISD::SETULE:
1580 case ISD::SETULT: {
1581 if (LHS == True)
1582 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1583 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1584 }
1585 case ISD::SETOLE:
1586 case ISD::SETOLT:
1587 case ISD::SETLE:
1588 case ISD::SETLT: {
1589 // Ordered. Assume ordered for undefined.
1590
1591 // Only do this after legalization to avoid interfering with other combines
1592 // which might occur.
1593 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1594 !DCI.isCalledByLegalizer())
1595 return SDValue();
1596
1597 // We need to permute the operands to get the correct NaN behavior. The
1598 // selected operand is the second one based on the failing compare with NaN,
1599 // so permute it based on the compare type the hardware uses.
1600 if (LHS == True)
1601 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1602 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1603 }
1604 case ISD::SETUGE:
1605 case ISD::SETUGT: {
1606 if (LHS == True)
1607 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1608 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1609 }
1610 case ISD::SETGT:
1611 case ISD::SETGE:
1612 case ISD::SETOGE:
1613 case ISD::SETOGT: {
1614 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1615 !DCI.isCalledByLegalizer())
1616 return SDValue();
1617
1618 if (LHS == True)
1619 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1620 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1621 }
1622 case ISD::SETCC_INVALID:
1623 llvm_unreachable("Invalid setcc condcode!");
1624 }
1625 return SDValue();
1626}
1627
1628/// Generate Min/Max node
1629SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1630 SDValue LHS, SDValue RHS,
1631 SDValue True, SDValue False,
1632 SDValue CC,
1633 DAGCombinerInfo &DCI) const {
1634 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1635 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1636
1637 SelectionDAG &DAG = DCI.DAG;
1638
1639 // If we can't directly match this, try to see if we can fold an fneg to
1640 // match.
1641
1642 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1643 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1644 SDValue NegTrue = peekFNeg(True);
1645
1646 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1647 // fmin/fmax.
1648 //
1649 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1650 // -> fneg (fmin_legacy lhs, K)
1651 //
1652 // TODO: Use getNegatedExpression
1653 if (LHS == NegTrue && CFalse && CRHS) {
1654 APFloat NegRHS = neg(CRHS->getValueAPF());
1655 if (NegRHS == CFalse->getValueAPF()) {
1656 SDValue Combined =
1657 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1658 if (Combined)
1659 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1660 return SDValue();
1661 }
1662 }
1663
1664 return SDValue();
1665}
1666
1667std::pair<SDValue, SDValue>
1668AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1669 SDLoc SL(Op);
1670
1671 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1672
1673 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1674 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1675
1676 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1677 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1678
1679 return std::pair(Lo, Hi);
1680}
1681
1682SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1683 SDLoc SL(Op);
1684
1685 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1686 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1687 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1688}
1689
1690SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1691 SDLoc SL(Op);
1692
1693 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1694 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1695 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1696}
1697
1698// Split a vector type into two parts. The first part is a power of two vector.
1699// The second part is whatever is left over, and is a scalar if it would
1700// otherwise be a 1-vector.
1701std::pair<EVT, EVT>
1702AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1703 EVT LoVT, HiVT;
1704 EVT EltVT = VT.getVectorElementType();
1705 unsigned NumElts = VT.getVectorNumElements();
1706 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1707 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1708 HiVT = NumElts - LoNumElts == 1
1709 ? EltVT
1710 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1711 return std::pair(LoVT, HiVT);
1712}
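// Illustrative note (not part of the original file): v3f32 splits into
// (LoVT = v2f32, HiVT = f32), and v7i32 splits into (v4i32, v3i32); the low
// half always gets the power-of-two element count.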
1713
1714// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1715// scalar.
1716std::pair<SDValue, SDValue>
1717AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1718 const EVT &LoVT, const EVT &HiVT,
1719 SelectionDAG &DAG) const {
1721 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1722 N.getValueType().getVectorNumElements() &&
1723 "More vector elements requested than available!");
1724 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1725 DAG.getVectorIdxConstant(0, DL));
1726 SDValue Hi = DAG.getNode(
1727 HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1728 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1729 return std::pair(Lo, Hi);
1730}
1731
1732SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1733 SelectionDAG &DAG) const {
1734 LoadSDNode *Load = cast<LoadSDNode>(Op);
1735 EVT VT = Op.getValueType();
1736 SDLoc SL(Op);
1737
1738
1739 // If this is a 2 element vector, we really want to scalarize and not create
1740 // weird 1 element vectors.
1741 if (VT.getVectorNumElements() == 2) {
1742 SDValue Ops[2];
1743 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1744 return DAG.getMergeValues(Ops, SL);
1745 }
1746
1747 SDValue BasePtr = Load->getBasePtr();
1748 EVT MemVT = Load->getMemoryVT();
1749
1750 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1751
1752 EVT LoVT, HiVT;
1753 EVT LoMemVT, HiMemVT;
1754 SDValue Lo, Hi;
1755
1756 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1757 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1758 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1759
1760 unsigned Size = LoMemVT.getStoreSize();
1761 Align BaseAlign = Load->getAlign();
1762 Align HiAlign = commonAlignment(BaseAlign, Size);
1763
1764 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1765 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1766 BaseAlign, Load->getMemOperand()->getFlags());
1767 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1768 SDValue HiLoad =
1769 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1770 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1771 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1772
1773 SDValue Join;
1774 if (LoVT == HiVT) {
1775 // This is the case that the vector is power of two so was evenly split.
1776 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1777 } else {
1778 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1779 DAG.getVectorIdxConstant(0, SL));
1780 Join = DAG.getNode(
1781 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1782 VT, Join, HiLoad,
1783 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1784 }
1785
1786 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1787 LoLoad.getValue(1), HiLoad.getValue(1))};
1788
1789 return DAG.getMergeValues(Ops, SL);
1790}
1791
1792SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1793 SelectionDAG &DAG) const {
1794 LoadSDNode *Load = cast<LoadSDNode>(Op);
1795 EVT VT = Op.getValueType();
1796 SDValue BasePtr = Load->getBasePtr();
1797 EVT MemVT = Load->getMemoryVT();
1798 SDLoc SL(Op);
1799 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1800 Align BaseAlign = Load->getAlign();
1801 unsigned NumElements = MemVT.getVectorNumElements();
1802
1803 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1804 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1805 if (NumElements != 3 ||
1806 (BaseAlign < Align(8) &&
1807 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1808 return SplitVectorLoad(Op, DAG);
1809
1810 assert(NumElements == 3);
1811
1812 EVT WideVT =
1813 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1814 EVT WideMemVT =
1815 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1816 SDValue WideLoad = DAG.getExtLoad(
1817 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1818 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1819 return DAG.getMergeValues(
1820 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1821 DAG.getVectorIdxConstant(0, SL)),
1822 WideLoad.getValue(1)},
1823 SL);
1824}
1825
1826SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1827 SelectionDAG &DAG) const {
1828 StoreSDNode *Store = cast<StoreSDNode>(Op);
1829 SDValue Val = Store->getValue();
1830 EVT VT = Val.getValueType();
1831
1832 // If this is a 2 element vector, we really want to scalarize and not create
1833 // weird 1 element vectors.
1834 if (VT.getVectorNumElements() == 2)
1835 return scalarizeVectorStore(Store, DAG);
1836
1837 EVT MemVT = Store->getMemoryVT();
1838 SDValue Chain = Store->getChain();
1839 SDValue BasePtr = Store->getBasePtr();
1840 SDLoc SL(Op);
1841
1842 EVT LoVT, HiVT;
1843 EVT LoMemVT, HiMemVT;
1844 SDValue Lo, Hi;
1845
1846 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1847 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1848 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1849
1850 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1851
1852 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1853 Align BaseAlign = Store->getAlign();
1854 unsigned Size = LoMemVT.getStoreSize();
1855 Align HiAlign = commonAlignment(BaseAlign, Size);
1856
1857 SDValue LoStore =
1858 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1859 Store->getMemOperand()->getFlags());
1860 SDValue HiStore =
1861 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1862 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1863
1864 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1865}
1866
1867// This is a shortcut for integer division because we have fast i32<->f32
1868// conversions, and fast f32 reciprocal instructions. The fractional part of a
1869// float is enough to accurately represent up to a 24-bit signed integer.
1870SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1871 bool Sign) const {
1872 SDLoc DL(Op);
1873 EVT VT = Op.getValueType();
1874 SDValue LHS = Op.getOperand(0);
1875 SDValue RHS = Op.getOperand(1);
1876 MVT IntVT = MVT::i32;
1877 MVT FltVT = MVT::f32;
1878
1879 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1880 if (LHSSignBits < 9)
1881 return SDValue();
1882
1883 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1884 if (RHSSignBits < 9)
1885 return SDValue();
1886
1887 unsigned BitSize = VT.getSizeInBits();
1888 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1889 unsigned DivBits = BitSize - SignBits;
1890 if (Sign)
1891 ++DivBits;
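 // Requiring at least 9 redundant sign bits on each operand bounds DivBits at
 // 24, so the operands and the quotient computed below all fit exactly in the
 // 24-bit significand of an f32.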
1892
1893 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1894 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1895
1896 SDValue jq = DAG.getConstant(1, DL, IntVT);
1897
1898 if (Sign) {
1899 // char|short jq = ia ^ ib;
1900 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1901
1902 // jq = jq >> (bitsize - 2)
1903 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1904 DAG.getConstant(BitSize - 2, DL, VT));
1905
1906 // jq = jq | 0x1
1907 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1908 }
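 // After the shift and the OR with 1, jq is +1 when ia and ib have the same
 // sign and -1 when they differ, i.e. the one-unit correction added to the
 // quotient estimate below when the remainder estimate reaches the divisor.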
1909
1910 // int ia = (int)LHS;
1911 SDValue ia = LHS;
1912
1913 // int ib = (int)RHS;
1914 SDValue ib = RHS;
1915
1916 // float fa = (float)ia;
1917 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1918
1919 // float fb = (float)ib;
1920 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1921
1922 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1923 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1924
1925 // fq = trunc(fq);
1926 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1927
1928 // float fqneg = -fq;
1929 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1930
1931 MachineFunction &MF = DAG.getMachineFunction();
1932
1933 bool UseFmadFtz = false;
1934 if (Subtarget->isGCN()) {
1935 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936 UseFmadFtz =
1937 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1938 }
1939
1940 // float fr = mad(fqneg, fb, fa);
1941 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1942 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1943 : (unsigned)ISD::FMAD;
1944 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1945
1946 // int iq = (int)fq;
1947 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1948
1949 // fr = fabs(fr);
1950 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1951
1952 // fb = fabs(fb);
1953 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1954
1955 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1956
1957 // int cv = fr >= fb;
1958 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1959
1960 // jq = (cv ? jq : 0);
1961 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1962
1963 // dst = iq + jq;
1964 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1965
1966 // Rem needs compensation; it's easier to recompute it.
1967 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1968 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1969
1970 // Truncate to number of bits this divide really is.
1971 if (Sign) {
1972 SDValue InRegSize
1973 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1974 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1975 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1976 } else {
1977 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1978 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1979 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1980 }
1981
1982 return DAG.getMergeValues({ Div, Rem }, DL);
1983}
1984
1985void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1986 SelectionDAG &DAG,
1987 SmallVectorImpl<SDValue> &Results) const {
1988 SDLoc DL(Op);
1989 EVT VT = Op.getValueType();
1990
1991 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1992
1993 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1994
1995 SDValue One = DAG.getConstant(1, DL, HalfVT);
1996 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1997
1998 //HiLo split
1999 SDValue LHS_Lo, LHS_Hi;
2000 SDValue LHS = Op.getOperand(0);
2001 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2002
2003 SDValue RHS_Lo, RHS_Hi;
2004 SDValue RHS = Op.getOperand(1);
2005 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2006
2007 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2008 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2009
2010 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2011 LHS_Lo, RHS_Lo);
2012
2013 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2014 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2015
2016 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2017 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2018 return;
2019 }
2020
2021 if (isTypeLegal(MVT::i64)) {
2022 // The algorithm here is based on ideas from "Software Integer Division",
2023 // Tom Rodeheffer, August 2008.
2024
2025 MachineFunction &MF = DAG.getMachineFunction();
2026 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2027
2028 // Compute denominator reciprocal.
2029 unsigned FMAD =
2030 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2031 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2032 ? (unsigned)ISD::FMAD
2033 : (unsigned)AMDGPUISD::FMAD_FTZ;
2034
2035 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2036 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2037 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2038 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2039 Cvt_Lo);
2040 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2041 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2042 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2043 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2044 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2045 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2046 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2047 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2048 Mul1);
2049 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2050 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2051 SDValue Rcp64 = DAG.getBitcast(VT,
2052 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2053
2054 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2055 SDValue One64 = DAG.getConstant(1, DL, VT);
2056 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2057 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2058
2059 // First round of UNR (Unsigned integer Newton-Raphson).
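 // Rcp64 approximates 2^64 / RHS in 64-bit fixed point. Each round computes
 // the residual (0 - RHS * Rcp) and folds mulhu(Rcp, residual) back in, the
 // fixed-point form of the Newton-Raphson step x' = x + x * (1 - RHS * x).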
2060 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2061 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2062 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2063 SDValue Mulhi1_Lo, Mulhi1_Hi;
2064 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2065 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2066 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2067 Mulhi1_Lo, Zero1);
2068 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2069 Mulhi1_Hi, Add1_Lo.getValue(1));
2070 SDValue Add1 = DAG.getBitcast(VT,
2071 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2072
2073 // Second round of UNR.
2074 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2075 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2076 SDValue Mulhi2_Lo, Mulhi2_Hi;
2077 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2078 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2079 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2080 Mulhi2_Lo, Zero1);
2081 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2082 Mulhi2_Hi, Add2_Lo.getValue(1));
2083 SDValue Add2 = DAG.getBitcast(VT,
2084 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2085
2086 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2087
2088 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2089
2090 SDValue Mul3_Lo, Mul3_Hi;
2091 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2092 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2093 Mul3_Lo, Zero1);
2094 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2095 Mul3_Hi, Sub1_Lo.getValue(1));
2096 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2097 SDValue Sub1 = DAG.getBitcast(VT,
2098 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2099
2100 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2101 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2102 ISD::SETUGE);
2103 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2104 ISD::SETUGE);
2105 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2106
2107 // TODO: Here and below portions of the code can be enclosed into if/endif.
2108 // Currently control flow is unconditional and we have 4 selects after
2109 // potential endif to substitute PHIs.
2110
2111 // if C3 != 0 ...
2112 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2113 RHS_Lo, Zero1);
2114 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2115 RHS_Hi, Sub1_Lo.getValue(1));
2116 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2117 Zero, Sub2_Lo.getValue(1));
2118 SDValue Sub2 = DAG.getBitcast(VT,
2119 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2120
2121 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2122
2123 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2124 ISD::SETUGE);
2125 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2126 ISD::SETUGE);
2127 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2128
2129 // if (C6 != 0)
2130 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2131
2132 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2133 RHS_Lo, Zero1);
2134 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2135 RHS_Hi, Sub2_Lo.getValue(1));
2136 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2137 Zero, Sub3_Lo.getValue(1));
2138 SDValue Sub3 = DAG.getBitcast(VT,
2139 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2140
2141 // endif C6
2142 // endif C3
2143
2144 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2145 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2146
2147 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2148 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2149
2150 Results.push_back(Div);
2151 Results.push_back(Rem);
2152
2153 return;
2154 }
2155
2156 // r600 expansion.
2157 // Get Speculative values
2158 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2159 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2160
2161 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2162 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2163 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2164
2165 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2166 SDValue DIV_Lo = Zero;
2167
2168 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2169
2170 for (unsigned i = 0; i < halfBitWidth; ++i) {
2171 const unsigned bitPos = halfBitWidth - i - 1;
2172 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2173 // Get value of high bit
2174 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2175 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2176 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2177
2178 // Shift
2179 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2180 // Add LHS high bit
2181 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2182
2183 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2184 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2185
2186 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2187
2188 // Update REM
2189 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2190 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2191 }
2192
2193 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2194 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2195 Results.push_back(DIV);
2196 Results.push_back(REM);
2197}
2198
2199SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2200 SelectionDAG &DAG) const {
2201 SDLoc DL(Op);
2202 EVT VT = Op.getValueType();
2203
2204 if (VT == MVT::i64) {
2205 SmallVector<SDValue, 2> Results;
2206 LowerUDIVREM64(Op, DAG, Results);
2207 return DAG.getMergeValues(Results, DL);
2208 }
2209
2210 if (VT == MVT::i32) {
2211 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2212 return Res;
2213 }
2214
2215 SDValue X = Op.getOperand(0);
2216 SDValue Y = Op.getOperand(1);
2217
2218 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2219 // algorithm used here.
2220
2221 // Initial estimate of inv(y).
2222 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2223
2224 // One round of UNR.
2225 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2226 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2227 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2228 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2229
2230 // Quotient/remainder estimate.
2231 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2232 SDValue R =
2233 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2234
2235 // First quotient/remainder refinement.
2236 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2237 SDValue One = DAG.getConstant(1, DL, VT);
2238 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2239 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2240 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2241 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2242 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2243
2244 // Second quotient/remainder refinement.
2245 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2246 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2247 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2248 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2249 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2250
2251 return DAG.getMergeValues({Q, R}, DL);
2252}
2253
2254SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2255 SelectionDAG &DAG) const {
2256 SDLoc DL(Op);
2257 EVT VT = Op.getValueType();
2258
2259 SDValue LHS = Op.getOperand(0);
2260 SDValue RHS = Op.getOperand(1);
2261
2262 SDValue Zero = DAG.getConstant(0, DL, VT);
2263 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2264
2265 if (VT == MVT::i32) {
2266 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2267 return Res;
2268 }
2269
2270 if (VT == MVT::i64 &&
2271 DAG.ComputeNumSignBits(LHS) > 32 &&
2272 DAG.ComputeNumSignBits(RHS) > 32) {
2273 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2274
2275 //HiLo split
2276 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2277 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2278 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2279 LHS_Lo, RHS_Lo);
2280 SDValue Res[2] = {
2281 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2282 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2283 };
2284 return DAG.getMergeValues(Res, DL);
2285 }
2286
2287 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2288 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2289 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2290 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2291
2292 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2293 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2294
2295 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2296 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2297
2298 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2299 SDValue Rem = Div.getValue(1);
2300
2301 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2302 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2303
2304 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2305 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2306
2307 SDValue Res[2] = {
2308 Div,
2309 Rem
2310 };
2311 return DAG.getMergeValues(Res, DL);
2312}
2313
2314// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2315SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2316 SDLoc SL(Op);
2317 EVT VT = Op.getValueType();
2318 auto Flags = Op->getFlags();
2319 SDValue X = Op.getOperand(0);
2320 SDValue Y = Op.getOperand(1);
2321
2322 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2323 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2324 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2325 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2326 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2327}
2328
2329SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2330 SDLoc SL(Op);
2331 SDValue Src = Op.getOperand(0);
2332
2333 // result = trunc(src)
2334 // if (src > 0.0 && src != result)
2335 // result += 1.0
2336
2337 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2338
2339 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2340 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2341
2342 EVT SetCCVT =
2343 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2344
2345 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2346 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2347 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2348
2349 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2350 // TODO: Should this propagate fast-math-flags?
2351 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2352}
2353
2354static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2355 SelectionDAG &DAG) {
2356 const unsigned FractBits = 52;
2357 const unsigned ExpBits = 11;
2358
2359 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2360 Hi,
2361 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2362 DAG.getConstant(ExpBits, SL, MVT::i32));
2363 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2364 DAG.getConstant(1023, SL, MVT::i32));
2365
2366 return Exp;
2367}
2368
2369SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2370 SDLoc SL(Op);
2371 SDValue Src = Op.getOperand(0);
2372
2373 assert(Op.getValueType() == MVT::f64);
2374
2375 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2376
2377 // Extract the upper half, since this is where we will find the sign and
2378 // exponent.
2379 SDValue Hi = getHiHalf64(Src, DAG);
2380
2381 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2382
2383 const unsigned FractBits = 52;
2384
2385 // Extract the sign bit.
2386 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2387 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2388
2389 // Extend back to 64-bits.
2390 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2391 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2392
2393 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2394 const SDValue FractMask
2395 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2396
2397 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2398 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2399 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2400
2401 EVT SetCCVT =
2402 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2403
2404 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2405
2406 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2407 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2408
2409 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2410 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2411
2412 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2413}
2414
2415SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2416 SelectionDAG &DAG) const {
2417 SDLoc SL(Op);
2418 SDValue Src = Op.getOperand(0);
2419
2420 assert(Op.getValueType() == MVT::f64);
2421
2422 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2423 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2424 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2425
2426 // TODO: Should this propagate fast-math-flags?
2427
2428 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2429 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
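 // Adding and then subtracting a copysigned 2^52 forces the fractional bits
 // to be rounded away in the default round-to-nearest-even mode; values with
 // magnitude of 2^52 or more are already integers and are passed through below.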
2430
2431 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2432
2433 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2434 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2435
2436 EVT SetCCVT =
2437 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2438 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2439
2440 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2441}
2442
2443SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2444 SelectionDAG &DAG) const {
2445 // FNEARBYINT and FRINT are the same, except in their handling of FP
2446 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2447 // rint, so just treat them as equivalent.
2448 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2449 Op.getOperand(0));
2450}
2451
2452SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2453 auto VT = Op.getValueType();
2454 auto Arg = Op.getOperand(0u);
2455 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2456}
2457
2458// XXX - May require not supporting f32 denormals?
2459
2460// Don't handle v2f16. The extra instructions to scalarize and repack around the
2461// compare and vselect end up producing worse code than scalarizing the whole
2462// operation.
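// Expands as trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x), i.e.
// round-half-away-from-zero, matching ISD::FROUND semantics.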
2463SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2464 SDLoc SL(Op);
2465 SDValue X = Op.getOperand(0);
2466 EVT VT = Op.getValueType();
2467
2468 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2469
2470 // TODO: Should this propagate fast-math-flags?
2471
2472 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2473
2474 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2475
2476 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2477 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2478
2479 EVT SetCCVT =
2480 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2481
2482 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2483 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2484 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2485
2486 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2487 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2488}
2489
2490SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2491 SDLoc SL(Op);
2492 SDValue Src = Op.getOperand(0);
2493
2494 // result = trunc(src);
2495 // if (src < 0.0 && src != result)
2496 // result += -1.0.
2497
2498 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2499
2500 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2501 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2502
2503 EVT SetCCVT =
2504 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2505
2506 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2507 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2508 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2509
2510 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2511 // TODO: Should this propagate fast-math-flags?
2512 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2513}
2514
2515/// Return true if it's known that \p Src can never be an f32 denormal value.
2516static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2517 switch (Src.getOpcode()) {
2518 case ISD::FP_EXTEND:
2519 return Src.getOperand(0).getValueType() == MVT::f16;
2520 case ISD::FP16_TO_FP:
2521 case ISD::FFREXP:
2522 return true;
2523 case ISD::INTRINSIC_WO_CHAIN: {
2524 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2525 switch (IntrinsicID) {
2526 case Intrinsic::amdgcn_frexp_mant:
2527 return true;
2528 default:
2529 return false;
2530 }
2531 }
2532 default:
2533 return false;
2534 }
2535
2536 llvm_unreachable("covered opcode switch");
2537}
2538
2539static bool allowApproxFunc(const SelectionDAG &DAG,
2540 SDNodeFlags Flags) {
2541 if (Flags.hasApproximateFuncs())
2542 return true;
2543 auto &Options = DAG.getTarget().Options;
2544 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2545}
2546
2547static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2548 SDValue Src,
2549 SDNodeFlags Flags) {
2550 return !valueIsKnownNeverF32Denorm(Src) &&
2551 DAG.getMachineFunction()
2552 .getDenormalMode(APFloat::IEEEsingle())
2553 .Input != DenormalMode::PreserveSign;
2554}
2555
2556SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2557 SDValue Src,
2558 SDNodeFlags Flags) const {
2559 SDLoc SL(Src);
2560 EVT VT = Src.getValueType();
2561 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2562 SDValue SmallestNormal =
2563 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2564
2565 // Want to scale denormals up, but negatives and 0 work just as well on the
2566 // scaled path.
2567 SDValue IsLtSmallestNormal = DAG.getSetCC(
2568 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2569 SmallestNormal, ISD::SETOLT);
2570
2571 return IsLtSmallestNormal;
2572}
2573
2574SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2575 SDNodeFlags Flags) const {
2576 SDLoc SL(Src);
2577 EVT VT = Src.getValueType();
2578 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2579 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2580
2581 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2582 SDValue IsFinite = DAG.getSetCC(
2583 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2584 Inf, ISD::SETOLT);
2585 return IsFinite;
2586}
2587
2588/// If denormal handling is required return the scaled input to FLOG2, and the
2589/// check for denormal range. Otherwise, return null values.
2590std::pair<SDValue, SDValue>
2592 SDValue Src, SDNodeFlags Flags) const {
2593 if (!needsDenormHandlingF32(DAG, Src, Flags))
2594 return {};
2595
2596 MVT VT = MVT::f32;
2597 const fltSemantics &Semantics = APFloat::IEEEsingle();
2598 SDValue SmallestNormal =
2599 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2600
2601 SDValue IsLtSmallestNormal = DAG.getSetCC(
2602 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2603 SmallestNormal, ISD::SETOLT);
2604
2605 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2606 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2607 SDValue ScaleFactor =
2608 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2609
2610 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2611 return {ScaledInput, IsLtSmallestNormal};
2612}
2613
2614SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2615 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2616 // If we have to handle denormals, scale up the input and adjust the result.
2617
2618 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2619 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2620
2621 SDLoc SL(Op);
2622 EVT VT = Op.getValueType();
2623 SDValue Src = Op.getOperand(0);
2624 SDNodeFlags Flags = Op->getFlags();
2625
2626 if (VT == MVT::f16) {
2627 // Nothing in half is a denormal when promoted to f32.
2628 assert(!Subtarget->has16BitInsts());
2629 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2630 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2631 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2632 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2633 }
2634
2635 auto [ScaledInput, IsLtSmallestNormal] =
2636 getScaledLogInput(DAG, SL, Src, Flags);
2637 if (!ScaledInput)
2638 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2639
2640 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2641
2642 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2643 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2644 SDValue ResultOffset =
2645 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2646 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2647}
2648
2649static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2650 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2651 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2652 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2653}
2654
2655SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2656 SelectionDAG &DAG) const {
2657 SDValue X = Op.getOperand(0);
2658 EVT VT = Op.getValueType();
2659 SDNodeFlags Flags = Op->getFlags();
2660 SDLoc DL(Op);
2661
2662 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2663 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2664
2665 const auto &Options = getTargetMachine().Options;
2666 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2667 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2668
2669 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2670 // Log and multiply in f32 is good enough for f16.
2671 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2672 }
2673
2674 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2675 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2676 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2677 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2678 }
2679
2680 return Lowered;
2681 }
2682
2683 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2684 if (ScaledInput)
2685 X = ScaledInput;
2686
2687 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2688
2689 SDValue R;
2690 if (Subtarget->hasFastFMAF32()) {
2691 // c+cc are ln(2)/ln(10) to more than 49 bits
2692 const float c_log10 = 0x1.344134p-2f;
2693 const float cc_log10 = 0x1.09f79ep-26f;
2694
2695 // c + cc is ln(2) to more than 49 bits
2696 const float c_log = 0x1.62e42ep-1f;
2697 const float cc_log = 0x1.efa39ep-25f;
2698
2699 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2700 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2701
2702 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2703 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2704 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2705 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2706 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2707 } else {
2708 // ch+ct is ln(2)/ln(10) to more than 36 bits
2709 const float ch_log10 = 0x1.344000p-2f;
2710 const float ct_log10 = 0x1.3509f6p-18f;
2711
2712 // ch + ct is ln(2) to more than 36 bits
2713 const float ch_log = 0x1.62e000p-1f;
2714 const float ct_log = 0x1.0bfbe8p-15f;
2715
2716 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2717 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2718
2719 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2720 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2721 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2722 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2723 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2724
2725 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2726 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2727 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2728 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2729 }
2730
2731 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2732 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2733
2734 // TODO: Check if known finite from source value.
2735 if (!IsFiniteOnly) {
2736 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2737 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2738 }
2739
2740 if (IsScaled) {
2741 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2742 SDValue ShiftK =
2743 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2744 SDValue Shift =
2745 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2746 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2747 }
2748
2749 return R;
2750}
2751
2752SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2753 return LowerFLOGCommon(Op, DAG);
2754}
2755
2756// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2757// promoted f16 operation.
2758SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2759 SelectionDAG &DAG, bool IsLog10,
2760 SDNodeFlags Flags) const {
2761 EVT VT = Src.getValueType();
2762 unsigned LogOp =
2763 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2764
2765 double Log2BaseInverted =
2766 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2767
2768 if (VT == MVT::f32) {
2769 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2770 if (ScaledInput) {
2771 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2772 SDValue ScaledResultOffset =
2773 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2774
2775 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2776
2777 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2778 ScaledResultOffset, Zero, Flags);
2779
2780 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2781
2782 if (Subtarget->hasFastFMAF32())
2783 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2784 Flags);
2785 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2786 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2787 }
2788 }
2789
2790 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2791 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2792
2793 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2794 Flags);
2795}
2796
2797SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2798 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2799 // If we have to handle denormals, scale up the input and adjust the result.
2800
2801 SDLoc SL(Op);
2802 EVT VT = Op.getValueType();
2803 SDValue Src = Op.getOperand(0);
2804 SDNodeFlags Flags = Op->getFlags();
2805
2806 if (VT == MVT::f16) {
2807 // Nothing in half is a denormal when promoted to f32.
2808 assert(!Subtarget->has16BitInsts());
2809 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2810 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2811 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2812 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2813 }
2814
2815 assert(VT == MVT::f32);
2816
2817 if (!needsDenormHandlingF32(DAG, Src, Flags))
2818 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2819
2820 // bool needs_scaling = x < -0x1.f80000p+6f;
2821 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
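 // This works because exp2(x + 64) * 2^-64 == exp2(x); biasing very negative
 // inputs by 64 keeps the intermediate v_exp_f32 result out of the denormal
 // range, and the final multiply scales it back down.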
2822
2823 // -nextafter(128.0, -1)
2824 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2825
2826 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2827
2828 SDValue NeedsScaling =
2829 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2830
2831 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2832 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2833
2834 SDValue AddOffset =
2835 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2836
2837 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2838 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2839
2840 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2841 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2842 SDValue ResultScale =
2843 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2844
2845 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2846}
2847
2848SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2849 SelectionDAG &DAG,
2850 SDNodeFlags Flags) const {
2851 EVT VT = X.getValueType();
2852 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2853
2854 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2855 // exp2(M_LOG2E_F * f);
2856 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2857 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2858 : (unsigned)ISD::FEXP2,
2859 SL, VT, Mul, Flags);
2860 }
2861
2862 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2863
2864 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2865 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2866
2867 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2868
2869 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2870
2871 SDValue AdjustedX =
2872 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2873
2874 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2875
2876 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2877
2878 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2879 SDValue AdjustedResult =
2880 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2881
2882 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2883 Flags);
2884}
2885
2886/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2887/// handled correctly.
2888SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2889 SelectionDAG &DAG,
2890 SDNodeFlags Flags) const {
2891 const EVT VT = X.getValueType();
2892 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2893
2894 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2895 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2896 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2897 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2898
2899 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2900 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2901 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2902 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2903 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2904 }
2905
2906 // bool s = x < -0x1.2f7030p+5f;
2907 // x += s ? 0x1.0p+5f : 0.0f;
2908 // exp10 = exp2(x * 0x1.a92000p+1f) *
2909 // exp2(x * 0x1.4f0978p-11f) *
2910 // (s ? 0x1.9f623ep-107f : 1.0f);
2911
2912 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2913
2914 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2915 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2916
2917 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2918 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2919 SDValue AdjustedX =
2920 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2921
2922 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2923 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2924
2925 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2926 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2927 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2928 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2929
2930 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2931
2932 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2933 SDValue AdjustedResult =
2934 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2935
2936 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2937 Flags);
2938}
2939
2940SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2941 EVT VT = Op.getValueType();
2942 SDLoc SL(Op);
2943 SDValue X = Op.getOperand(0);
2944 SDNodeFlags Flags = Op->getFlags();
2945 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2946
2947 if (VT.getScalarType() == MVT::f16) {
2948 // v_exp_f16 (fmul x, log2e)
2949 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2950 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2951
2952 if (VT.isVector())
2953 return SDValue();
2954
2955 // exp(f16 x) ->
2956 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2957
2958 // Nothing in half is a denormal when promoted to f32.
2959 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2960 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2961 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2962 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2963 }
2964
2965 assert(VT == MVT::f32);
2966
2967 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2968 // library behavior. Also, is known-not-daz source sufficient?
2969 if (allowApproxFunc(DAG, Flags)) {
2970 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2971 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2972 }
2973
2974 // Algorithm:
2975 //
2976 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2977 //
2978 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2979 // n = 64*m + j, 0 <= j < 64
2980 //
2981 // e^x = 2^((64*m + j + f)/64)
2982 // = (2^m) * (2^(j/64)) * 2^(f/64)
2983 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2984 //
2985 // f = x*(64/ln(2)) - n
2986 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
2987 //
2988 // e^x = (2^m) * (2^(j/64)) * e^r
2989 //
2990 // (2^(j/64)) is precomputed
2991 //
2992 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2993 // e^r = 1 + q
2994 //
2995 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2996 //
2997 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
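 //
 // In the code below, PH + PL is an extended-precision product x * log2(e)
 // (or x * log2(10) for exp10), E = roundeven(PH) becomes the ldexp exponent,
 // and exp2(PH - E + PL) supplies the fractional factor.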
2998 SDNodeFlags FlagsNoContract = Flags;
2999 FlagsNoContract.setAllowContract(false);
3000
3001 SDValue PH, PL;
3002 if (Subtarget->hasFastFMAF32()) {
3003 const float c_exp = numbers::log2ef;
3004 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3005 const float c_exp10 = 0x1.a934f0p+1f;
3006 const float cc_exp10 = 0x1.2f346ep-24f;
3007
3008 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3009 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3010
3011 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3012 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3013 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3014 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3015 } else {
3016 const float ch_exp = 0x1.714000p+0f;
3017 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3018
3019 const float ch_exp10 = 0x1.a92000p+1f;
3020 const float cl_exp10 = 0x1.4f0978p-11f;
3021
3022 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3023 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3024
3025 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3026 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3027 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3028 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3029 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3030
3031 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3032
3033 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3034 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3035 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3036 }
3037
3038 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3039
3040 // It is unsafe to contract this fsub into the PH multiply.
3041 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3042
3043 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3044 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3045 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3046
3047 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3048
3049 SDValue UnderflowCheckConst =
3050 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3051
3052 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3053 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3054 SDValue Underflow =
3055 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3056
3057 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3058 const auto &Options = getTargetMachine().Options;
3059
3060 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3061 SDValue OverflowCheckConst =
3062 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3063 SDValue Overflow =
3064 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3065 SDValue Inf =
3066 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3067 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3068 }
3069
3070 return R;
3071}
3072
3073static bool isCtlzOpc(unsigned Opc) {
3074 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3075}
3076
3077static bool isCttzOpc(unsigned Opc) {
3078 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3079}
3080
3081SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3082 SelectionDAG &DAG) const {
3083 auto SL = SDLoc(Op);
3084 auto Arg = Op.getOperand(0u);
3085 auto ResultVT = Op.getValueType();
3086
3087 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3088 return {};
3089
3090 assert(isCtlzOpc(Op.getOpcode()));
3091 assert(ResultVT == Arg.getValueType());
3092
3093 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3094 auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3095 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3096 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3097 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
3098 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3099}
3100
3101SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3102 SDLoc SL(Op);
3103 SDValue Src = Op.getOperand(0);
3104
3105 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3106 bool Ctlz = isCtlzOpc(Op.getOpcode());
3107 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3108
3109 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3110 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3111 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3112
3113 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3114 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3115 // (cttz hi:lo) -> (umin (ffbl src), 32)
3116 // (ctlz_zero_undef src) -> (ffbh src)
3117 // (cttz_zero_undef src) -> (ffbl src)
3118
3119 // The 64-bit scalar version produces a 32-bit result:
3120 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3121 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3122 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3123 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3124 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3125 if (!ZeroUndef) {
3126 const SDValue ConstVal = DAG.getConstant(
3127 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3128 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3129 }
3130 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3131 }
3132
3133 SDValue Lo, Hi;
3134 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3135
3136 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3137 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3138
3139 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3140 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3141 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3142 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
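 //
 // ffbh/ffbl return -1 (0xffffffff) for a zero input, so when a defined result
 // is required the saturating add keeps that value at UINT32_MAX instead of
 // wrapping, and the final umin with 64 then yields the correct answer.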
3143
3144 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3145 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3146 if (Ctlz)
3147 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3148 else
3149 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3150
3151 SDValue NewOpr;
3152 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3153 if (!ZeroUndef) {
3154 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3155 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3156 }
3157
3158 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3159}
3160
3161SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3162 bool Signed) const {
3163 // The regular method converting a 64-bit integer to float roughly consists of
3164 // 2 steps: normalization and rounding. In fact, after normalization, the
3165 // conversion from a 64-bit integer to a float is essentially the same as the
3166 // one from a 32-bit integer. The only difference is that it has more
3167 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3168 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3169 // converted into the correct float number. The basic steps for the unsigned
3170 // conversion are illustrated in the following pseudo code:
3171 //
3172 // f32 uitofp(i64 u) {
3173 // i32 hi, lo = split(u);
3174 // // Only count the leading zeros in hi as we have native support of the
3175 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3176 // // reduced to a 32-bit one automatically.
3177 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3178 // u <<= shamt;
3179 // hi, lo = split(u);
3180 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3181 // // convert it as a 32-bit integer and scale the result back.
3182 // return uitofp(hi) * 2^(32 - shamt);
3183 // }
3184 //
3185 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3186 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3187 // converted instead, followed by negation based on its sign bit.
3188
3189 SDLoc SL(Op);
3190 SDValue Src = Op.getOperand(0);
3191
3192 SDValue Lo, Hi;
3193 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3194 SDValue Sign;
3195 SDValue ShAmt;
3196 if (Signed && Subtarget->isGCN()) {
3197 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3198 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3199 // account. That is, the maximal shift is
3200 // - 32 if Lo and Hi have opposite signs;
3201 // - 33 if Lo and Hi have the same sign.
3202 //
3203 // Or, MaxShAmt = 33 + OppositeSign, where
3204 //
3205 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3206 // - -1 if Lo and Hi have opposite signs; and
3207 // - 0 otherwise.
3208 //
3209 // All in all, ShAmt is calculated as
3210 //
3211 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3212 //
3213 // or
3214 //
3215 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3216 //
3217 // to reduce the critical path.
3218 SDValue OppositeSign = DAG.getNode(
3219 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3220 DAG.getConstant(31, SL, MVT::i32));
3221 SDValue MaxShAmt =
3222 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3223 OppositeSign);
3224 // Count the leading sign bits.
3225 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3226 // Different from unsigned conversion, the shift should be one bit less to
3227 // preserve the sign bit.
3228 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3229 DAG.getConstant(1, SL, MVT::i32));
3230 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3231 } else {
3232 if (Signed) {
3233 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3234 // absolute value first.
3235 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3236 DAG.getConstant(63, SL, MVT::i64));
3237 SDValue Abs =
3238 DAG.getNode(ISD::XOR, SL, MVT::i64,
3239 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3240 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3241 }
3242 // Count the leading zeros.
3243 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3244 // The shift amount for signed integers is [0, 32].
3245 }
3246 // Normalize the given 64-bit integer.
3247 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3248 // Split it again.
3249 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3250 // Calculate the adjust bit for rounding.
3251 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3252 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3253 DAG.getConstant(1, SL, MVT::i32), Lo);
3254 // Get the 32-bit normalized integer.
3255 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
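 // The OR above merges a sticky bit, so bits shifted out into Lo still
 // influence the round-to-nearest result of the 32-bit conversion below.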
3256 // Convert the normalized 32-bit integer into f32.
3257 unsigned Opc =
3258 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3259 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3260
3261 // Finally, need to scale back the converted floating number as the original
3262 // 64-bit integer is converted as a 32-bit one.
3263 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3264 ShAmt);
3265 // On GCN, use LDEXP directly.
3266 if (Subtarget->isGCN())
3267 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3268
3269 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3270 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3271 // exponent is enough to avoid overflowing into the sign bit.
3272 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3273 DAG.getConstant(23, SL, MVT::i32));
3274 SDValue IVal =
3275 DAG.getNode(ISD::ADD, SL, MVT::i32,
3276 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3277 if (Signed) {
3278 // Set the sign bit.
3279 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3280 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3281 DAG.getConstant(31, SL, MVT::i32));
3282 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3283 }
3284 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3285}
3286
3287SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3288 bool Signed) const {
3289 SDLoc SL(Op);
3290 SDValue Src = Op.getOperand(0);
3291
3292 SDValue Lo, Hi;
3293 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3294
3295 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3296 SL, MVT::f64, Hi);
3297
3298 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3299
3300 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3301 DAG.getConstant(32, SL, MVT::i32));
3302 // TODO: Should this propagate fast-math-flags?
3303 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3304}
3305
3306SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3307 SelectionDAG &DAG) const {
3308 // TODO: Factor out code common with LowerSINT_TO_FP.
3309 EVT DestVT = Op.getValueType();
3310 SDValue Src = Op.getOperand(0);
3311 EVT SrcVT = Src.getValueType();
3312
3313 if (SrcVT == MVT::i16) {
3314 if (DestVT == MVT::f16)
3315 return Op;
3316 SDLoc DL(Op);
3317
3318 // Promote src to i32
3319 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3320 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3321 }
3322
3323 if (DestVT == MVT::bf16) {
3324 SDLoc SL(Op);
3325 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3326 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3327 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3328 }
3329
3330 if (SrcVT != MVT::i64)
3331 return Op;
3332
3333 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3334 SDLoc DL(Op);
3335
3336 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3337 SDValue FPRoundFlag =
3338 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3339 SDValue FPRound =
3340 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3341
3342 return FPRound;
3343 }
3344
3345 if (DestVT == MVT::f32)
3346 return LowerINT_TO_FP32(Op, DAG, false);
3347
3348 assert(DestVT == MVT::f64);
3349 return LowerINT_TO_FP64(Op, DAG, false);
3350}
3351
3352SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3353 SelectionDAG &DAG) const {
3354 EVT DestVT = Op.getValueType();
3355
3356 SDValue Src = Op.getOperand(0);
3357 EVT SrcVT = Src.getValueType();
3358
3359 if (SrcVT == MVT::i16) {
3360 if (DestVT == MVT::f16)
3361 return Op;
3362
3363 SDLoc DL(Op);
3364 // Promote src to i32
3365 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3366 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3367 }
3368
3369 if (DestVT == MVT::bf16) {
3370 SDLoc SL(Op);
3371 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3372 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3373 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3374 }
3375
3376 if (SrcVT != MVT::i64)
3377 return Op;
3378
3379 // TODO: Factor out code common with LowerUINT_TO_FP.
3380
3381 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3382 SDLoc DL(Op);
3383 SDValue Src = Op.getOperand(0);
3384
3385 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3386 SDValue FPRoundFlag =
3387 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3388 SDValue FPRound =
3389 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3390
3391 return FPRound;
3392 }
3393
3394 if (DestVT == MVT::f32)
3395 return LowerINT_TO_FP32(Op, DAG, true);
3396
3397 assert(DestVT == MVT::f64);
3398 return LowerINT_TO_FP64(Op, DAG, true);
3399}
3400
3401SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3402 bool Signed) const {
3403 SDLoc SL(Op);
3404
3405 SDValue Src = Op.getOperand(0);
3406 EVT SrcVT = Src.getValueType();
3407
3408 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3409
3410 // The basic idea of converting a floating point number into a pair of 32-bit
3411 // integers is illustrated as follows:
3412 //
3413 // tf := trunc(val);
3414 // hif := floor(tf * 2^-32);
3415 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3416 // hi := fptoi(hif);
3417 // lo := fptoi(lof);
3418 //
3419 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3420 SDValue Sign;
3421 if (Signed && SrcVT == MVT::f32) {
3422 // However, a 32-bit floating point number has only 23 bits mantissa and
3423 // it's not enough to hold all the significant bits of `lof` if val is
3424 // negative. To avoid the loss of precision, we need to take the absolute
3425 // value after truncating and flip the result back based on the original
3426 // signedness.
3427 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3428 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3429 DAG.getConstant(31, SL, MVT::i32));
3430 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3431 }
3432
3433 SDValue K0, K1;
3434 if (SrcVT == MVT::f64) {
3435 K0 = DAG.getConstantFP(
3436 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3437 SrcVT);
3438 K1 = DAG.getConstantFP(
3439 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3440 SrcVT);
3441 } else {
3442 K0 = DAG.getConstantFP(
3443 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3444 K1 = DAG.getConstantFP(
3445 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3446 }
3447 // TODO: Should this propagate fast-math-flags?
3448 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3449
3450 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3451
3452 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3453
3454 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3455 : ISD::FP_TO_UINT,
3456 SL, MVT::i32, FloorMul);
3457 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3458
3459 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3460 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3461
3462 if (Signed && SrcVT == MVT::f32) {
3463 assert(Sign);
3464 // Flip the result based on the signedness, which is either all 0s or 1s.
3465 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3466 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3467 // r := xor(r, sign) - sign;
3468 Result =
3469 DAG.getNode(ISD::SUB, SL, MVT::i64,
3470 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3471 }
3472
3473 return Result;
3474}
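// Illustrative only: the split above written as a standalone C++ sketch for
// the unsigned f64 case. The helper name splitTruncatedF64 is hypothetical
// (not part of this file) and assumes <cmath>/<cstdint> plus a value that
// fits in 64 bits; it mirrors the pseudocode comment and the 2^-32 / -2^32
// constants used above.
//
//   static uint64_t splitTruncatedF64(double Val) {
//     double Tf  = std::trunc(Val);               // tf  := trunc(val)
//     double Hif = std::floor(Tf * 0x1p-32);      // hif := floor(tf * 2^-32), exact scale by 2^-32
//     double Lof = std::fma(Hif, -0x1p32, Tf);    // lof := tf - hif * 2^32, computed exactly by fma
//     uint32_t Hi = static_cast<uint32_t>(Hif);   // hi  := fptoui(hif)
//     uint32_t Lo = static_cast<uint32_t>(Lof);   // lo  := fptoui(lof)
//     return (static_cast<uint64_t>(Hi) << 32) | Lo;
//   }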
3475
3476 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3477 SDLoc DL(Op);
3478 SDValue N0 = Op.getOperand(0);
3479
3480 // Convert to target node to get known bits
3481 if (N0.getValueType() == MVT::f32)
3482 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3483
3484 if (getTargetMachine().Options.UnsafeFPMath) {
3485 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3486 return SDValue();
3487 }
3488
3489 assert(N0.getSimpleValueType() == MVT::f64);
3490
3491 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3492 const unsigned ExpMask = 0x7ff;
3493 const unsigned ExpBiasf64 = 1023;
3494 const unsigned ExpBiasf16 = 15;
3495 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3496 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3497 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3498 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3499 DAG.getConstant(32, DL, MVT::i64));
3500 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3501 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3502 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3503 DAG.getConstant(20, DL, MVT::i64));
3504 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3505 DAG.getConstant(ExpMask, DL, MVT::i32));
3506 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3507 // add the f16 bias (15) to get the biased exponent for the f16 format.
3508 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3509 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
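 // Worked example of the rebias: for N0 == 1.0 the f64 exponent field is
 // 1023, so E becomes 1023 - 1023 + 15 = 15, the biased f16 exponent of 1.0.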
3510
3511 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3512 DAG.getConstant(8, DL, MVT::i32));
3513 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3514 DAG.getConstant(0xffe, DL, MVT::i32));
3515
3516 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3517 DAG.getConstant(0x1ff, DL, MVT::i32));
3518 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3519
3520 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3521 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3522
3523 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3524 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3525 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3526 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3527
3528 // N = M | (E << 12);
3529 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3530 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3531 DAG.getConstant(12, DL, MVT::i32)));
3532
3533 // B = clamp(1-E, 0, 13);
3534 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3535 One, E);
3536 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3537 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3538 DAG.getConstant(13, DL, MVT::i32));
3539
3540 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3541 DAG.getConstant(0x1000, DL, MVT::i32));
3542
3543 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3544 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3545 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3546 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3547
3548 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3549 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3550 DAG.getConstant(0x7, DL, MVT::i32));
3551 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3552 DAG.getConstant(2, DL, MVT::i32));
3553 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3554 One, Zero, ISD::SETEQ);
3555 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3556 One, Zero, ISD::SETGT);
3557 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3558 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3559
3560 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3561 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3562 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3563 I, V, ISD::SETEQ);
3564
3565 // Extract the sign bit.
3566 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3567 DAG.getConstant(16, DL, MVT::i32));
3568 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3569 DAG.getConstant(0x8000, DL, MVT::i32));
3570
3571 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3572 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3573}
3574
3575 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3576 SelectionDAG &DAG) const {
3577 SDValue Src = Op.getOperand(0);
3578 unsigned OpOpcode = Op.getOpcode();
3579 EVT SrcVT = Src.getValueType();
3580 EVT DestVT = Op.getValueType();
3581
3582 // Will be selected natively
3583 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3584 return Op;
3585
3586 if (SrcVT == MVT::bf16) {
3587 SDLoc DL(Op);
3588 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3589 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3590 }
3591
3592 // Promote i16 to i32
3593 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3594 SDLoc DL(Op);
3595
3596 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3597 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3598 }
3599
3600 if (DestVT != MVT::i64)
3601 return Op;
3602
3603 if (SrcVT == MVT::f16 ||
3604 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3605 SDLoc DL(Op);
3606
3607 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3608 unsigned Ext =
3609 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3610 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3611 }
3612
3613 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3614 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3615
3616 return SDValue();
3617}
3618
3619 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3620 SelectionDAG &DAG) const {
3621 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3622 MVT VT = Op.getSimpleValueType();
3623 MVT ScalarVT = VT.getScalarType();
3624
3625 assert(VT.isVector());
3626
3627 SDValue Src = Op.getOperand(0);
3628 SDLoc DL(Op);
3629
3630 // TODO: Don't scalarize on Evergreen?
3631 unsigned NElts = VT.getVectorNumElements();
3632 SmallVector<SDValue, 8> Args;
3633 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3634
3635 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3636 for (unsigned I = 0; I < NElts; ++I)
3637 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3638
3639 return DAG.getBuildVector(VT, DL, Args);
3640}
3641
3642//===----------------------------------------------------------------------===//
3643// Custom DAG optimizations
3644//===----------------------------------------------------------------------===//
3645
3646static bool isU24(SDValue Op, SelectionDAG &DAG) {
3647 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3648}
3649
3650static bool isI24(SDValue Op, SelectionDAG &DAG) {
3651 EVT VT = Op.getValueType();
3652 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3653 // as unsigned 24-bit values.
3654 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3655}
3656
3657 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3658 DAGCombinerInfo &DCI) const {
3659 SelectionDAG &DAG = DCI.DAG;
3660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3661 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3662
3663 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3664 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3665 unsigned NewOpcode = Node24->getOpcode();
3666 if (IsIntrin) {
3667 unsigned IID = Node24->getConstantOperandVal(0);
3668 switch (IID) {
3669 case Intrinsic::amdgcn_mul_i24:
3670 NewOpcode = AMDGPUISD::MUL_I24;
3671 break;
3672 case Intrinsic::amdgcn_mul_u24:
3673 NewOpcode = AMDGPUISD::MUL_U24;
3674 break;
3675 case Intrinsic::amdgcn_mulhi_i24:
3676 NewOpcode = AMDGPUISD::MULHI_I24;
3677 break;
3678 case Intrinsic::amdgcn_mulhi_u24:
3679 NewOpcode = AMDGPUISD::MULHI_U24;
3680 break;
3681 default:
3682 llvm_unreachable("Expected 24-bit mul intrinsic");
3683 }
3684 }
3685
3686 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3687
3688 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3689 // the operands to have other uses, but will only perform simplifications that
3690 // involve bypassing some nodes for this user.
3691 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3692 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3693 if (DemandedLHS || DemandedRHS)
3694 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3695 DemandedLHS ? DemandedLHS : LHS,
3696 DemandedRHS ? DemandedRHS : RHS);
3697
3698 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3699 // operands if this node is the only user.
3700 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3701 return SDValue(Node24, 0);
3702 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3703 return SDValue(Node24, 0);
3704
3705 return SDValue();
3706}
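// For example, given (mul_u24 (and x, 0xffffff), y), only the low 24 bits of
// each operand are demanded, so SimplifyMultipleUseDemandedBits can bypass
// the AND for this user even if the AND has other uses, while the
// SimplifyDemandedBits calls may rewrite the operands in place when this node
// is their only user.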
3707
3708template <typename IntTy>
3709 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3710 uint32_t Width, const SDLoc &DL) {
3711 if (Width + Offset < 32) {
3712 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3713 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3714 return DAG.getConstant(Result, DL, MVT::i32);
3715 }
3716
3717 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3718}
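// Worked example of the constant fold above: for Src0 = 0x12345678 with
// Offset = 8 and Width = 8, Shl = 0x12345678 << 16 = 0x56780000, and shifting
// right by 32 - Width = 24 leaves 0x56, i.e. bits [8, 16) of Src0 (sign- or
// zero-extended according to IntTy).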
3719
3720static bool hasVolatileUser(SDNode *Val) {
3721 for (SDNode *U : Val->uses()) {
3722 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3723 if (M->isVolatile())
3724 return true;
3725 }
3726 }
3727
3728 return false;
3729}
3730
3731 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3732 // i32 vectors are the canonical memory type.
3733 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3734 return false;
3735
3736 if (!VT.isByteSized())
3737 return false;
3738
3739 unsigned Size = VT.getStoreSize();
3740
3741 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3742 return false;
3743
3744 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3745 return false;
3746
3747 return true;
3748}
3749
3750// Replace a load of an illegal type with a load of a bitcast to a friendlier
3751// type.
3752 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3753 DAGCombinerInfo &DCI) const {
3754 if (!DCI.isBeforeLegalize())
3755 return SDValue();
3756
3757 LoadSDNode *LN = cast<LoadSDNode>(N);
3758 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3759 return SDValue();
3760
3761 SDLoc SL(N);
3762 SelectionDAG &DAG = DCI.DAG;
3763 EVT VT = LN->getMemoryVT();
3764
3765 unsigned Size = VT.getStoreSize();
3766 Align Alignment = LN->getAlign();
3767 if (Alignment < Size && isTypeLegal(VT)) {
3768 unsigned IsFast;
3769 unsigned AS = LN->getAddressSpace();
3770
3771 // Expand unaligned loads earlier than legalization. Due to visitation order
3772 // problems during legalization, the emitted instructions to pack and unpack
3773 // the bytes again are not eliminated in the case of an unaligned copy.
3774 if (!allowsMisalignedMemoryAccesses(
3775 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3776 if (VT.isVector())
3777 return SplitVectorLoad(SDValue(LN, 0), DAG);
3778
3779 SDValue Ops[2];
3780 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3781
3782 return DAG.getMergeValues(Ops, SDLoc(N));
3783 }
3784
3785 if (!IsFast)
3786 return SDValue();
3787 }
3788
3789 if (!shouldCombineMemoryType(VT))
3790 return SDValue();
3791
3792 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3793
3794 SDValue NewLoad
3795 = DAG.getLoad(NewVT, SL, LN->getChain(),
3796 LN->getBasePtr(), LN->getMemOperand());
3797
3798 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3799 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3800 return SDValue(N, 0);
3801}
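// For example, a simple v4i8 load (an illegal 4-byte vector type) is rewritten
// as an i32 load of the same address followed by a bitcast back to v4i8, since
// getEquivalentMemType maps the 32-bit memory type to i32.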
3802
3803// Replace store of an illegal type with a store of a bitcast to a friendlier
3804// type.
3805 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3806 DAGCombinerInfo &DCI) const {
3807 if (!DCI.isBeforeLegalize())
3808 return SDValue();
3809
3810 StoreSDNode *SN = cast<StoreSDNode>(N);
3811 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3812 return SDValue();
3813
3814 EVT VT = SN->getMemoryVT();
3815 unsigned Size = VT.getStoreSize();
3816
3817 SDLoc SL(N);
3818 SelectionDAG &DAG = DCI.DAG;
3819 Align Alignment = SN->getAlign();
3820 if (Alignment < Size && isTypeLegal(VT)) {
3821 unsigned IsFast;
3822 unsigned AS = SN->getAddressSpace();
3823
3824 // Expand unaligned stores earlier than legalization. Due to visitation
3825 // order problems during legalization, the emitted instructions to pack and
3826 // unpack the bytes again are not eliminated in the case of an unaligned
3827 // copy.
3828 if (!allowsMisalignedMemoryAccesses(
3829 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3830 if (VT.isVector())
3831 return SplitVectorStore(SDValue(SN, 0), DAG);
3832
3833 return expandUnalignedStore(SN, DAG);
3834 }
3835
3836 if (!IsFast)
3837 return SDValue();
3838 }
3839
3840 if (!shouldCombineMemoryType(VT))
3841 return SDValue();
3842
3843 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3844 SDValue Val = SN->getValue();
3845
3846 //DCI.AddToWorklist(Val.getNode());
3847
3848 bool OtherUses = !Val.hasOneUse();
3849 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3850 if (OtherUses) {
3851 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3852 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3853 }
3854
3855 return DAG.getStore(SN->getChain(), SL, CastVal,
3856 SN->getBasePtr(), SN->getMemOperand());
3857}
3858
3859// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3860// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3861// issues.
3862 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3863 DAGCombinerInfo &DCI) const {
3864 SelectionDAG &DAG = DCI.DAG;
3865 SDValue N0 = N->getOperand(0);
3866
3867 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3868 // (vt2 (truncate (assertzext vt0:x, vt1)))
3869 if (N0.getOpcode() == ISD::TRUNCATE) {
3870 SDValue N1 = N->getOperand(1);
3871 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3872 SDLoc SL(N);
3873
3874 SDValue Src = N0.getOperand(0);
3875 EVT SrcVT = Src.getValueType();
3876 if (SrcVT.bitsGE(ExtVT)) {
3877 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3878 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3879 }
3880 }
3881
3882 return SDValue();
3883}
3884
3885 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3886 SDNode *N, DAGCombinerInfo &DCI) const {
3887 unsigned IID = N->getConstantOperandVal(0);
3888 switch (IID) {
3889 case Intrinsic::amdgcn_mul_i24:
3890 case Intrinsic::amdgcn_mul_u24:
3891 case Intrinsic::amdgcn_mulhi_i24:
3892 case Intrinsic::amdgcn_mulhi_u24:
3893 return simplifyMul24(N, DCI);
3894 case Intrinsic::amdgcn_fract:
3895 case Intrinsic::amdgcn_rsq:
3896 case Intrinsic::amdgcn_rcp_legacy:
3897 case Intrinsic::amdgcn_rsq_legacy:
3898 case Intrinsic::amdgcn_rsq_clamp: {
3899 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3900 SDValue Src = N->getOperand(1);
3901 return Src.isUndef() ? Src : SDValue();
3902 }
3903 case Intrinsic::amdgcn_frexp_exp: {
3904 // frexp_exp (fneg x) -> frexp_exp x
3905 // frexp_exp (fabs x) -> frexp_exp x
3906 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3907 SDValue Src = N->getOperand(1);
3908 SDValue PeekSign = peekFPSignOps(Src);
3909 if (PeekSign == Src)
3910 return SDValue();
3911 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3912 0);
3913 }
3914 default:
3915 return SDValue();
3916 }
3917}
3918
3919/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3920/// binary operation \p Opc to it with the corresponding constant operands.
3921 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3922 DAGCombinerInfo &DCI, const SDLoc &SL,
3923 unsigned Opc, SDValue LHS,
3924 uint32_t ValLo, uint32_t ValHi) const {
3925 SelectionDAG &DAG = DCI.DAG;
3926 SDValue Lo, Hi;
3927 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3928
3929 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3930 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3931
3932 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3933 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3934
3935 // Re-visit the ands. It's possible we eliminated one of them and it could
3936 // simplify the vector.
3937 DCI.AddToWorklist(Lo.getNode());
3938 DCI.AddToWorklist(Hi.getNode());
3939
3940 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3941 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3942}
3943
3944 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3945 DAGCombinerInfo &DCI) const {
3946 EVT VT = N->getValueType(0);
3947
3948 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3949 if (!RHS)
3950 return SDValue();
3951
3952 SDValue LHS = N->getOperand(0);
3953 unsigned RHSVal = RHS->getZExtValue();
3954 if (!RHSVal)
3955 return LHS;
3956
3957 SDLoc SL(N);
3958 SelectionDAG &DAG = DCI.DAG;
3959
3960 switch (LHS->getOpcode()) {
3961 default:
3962 break;
3963 case ISD::ZERO_EXTEND:
3964 case ISD::SIGN_EXTEND:
3965 case ISD::ANY_EXTEND: {
3966 SDValue X = LHS->getOperand(0);
3967
3968 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3969 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3970 // Prefer build_vector as the canonical form if packed types are legal.
3971 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3972 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3973 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3974 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3975 }
3976
3977 // shl (ext x) => zext (shl x), if shift does not overflow int
3978 if (VT != MVT::i64)
3979 break;
3980 KnownBits Known = DAG.computeKnownBits(X);
3981 unsigned LZ = Known.countMinLeadingZeros();
3982 if (LZ < RHSVal)
3983 break;
3984 EVT XVT = X.getValueType();
3985 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3986 return DAG.getZExtOrTrunc(Shl, SL, VT);
3987 }
3988 }
3989
3990 if (VT != MVT::i64)
3991 return SDValue();
3992
3993 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3994
3995 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3996 // common case, splitting this into a move and a 32-bit shift is faster and
3997 // the same code size.
3998 if (RHSVal < 32)
3999 return SDValue();
4000
4001 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4002
4003 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4004 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4005
4006 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4007
4008 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4009 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4010}
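// Worked example of the split above: (shl i64:x, 40) becomes a build_vector
// {0, (shl (trunc x to i32), 8)} bitcast back to i64, so the low word is zero
// and the high word holds the low bits of x shifted left by 40 - 32 = 8.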
4011
4012 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4013 DAGCombinerInfo &DCI) const {
4014 if (N->getValueType(0) != MVT::i64)
4015 return SDValue();
4016
4017 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4018 if (!RHS)
4019 return SDValue();
4020
4021 SelectionDAG &DAG = DCI.DAG;
4022 SDLoc SL(N);
4023 unsigned RHSVal = RHS->getZExtValue();
4024
4025 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4026 if (RHSVal == 32) {
4027 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4028 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4029 DAG.getConstant(31, SL, MVT::i32));
4030
4031 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4032 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4033 }
4034
4035 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4036 if (RHSVal == 63) {
4037 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4038 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4039 DAG.getConstant(31, SL, MVT::i32));
4040 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4041 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4042 }
4043
4044 return SDValue();
4045}
4046
4047 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4048 DAGCombinerInfo &DCI) const {
4049 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4050 if (!RHS)
4051 return SDValue();
4052
4053 EVT VT = N->getValueType(0);
4054 SDValue LHS = N->getOperand(0);
4055 unsigned ShiftAmt = RHS->getZExtValue();
4056 SelectionDAG &DAG = DCI.DAG;
4057 SDLoc SL(N);
4058
4059 // fold (srl (and x, (c1 << c2)), c2) -> (and (srl x, c2), c1)
4060 // This improves the ability to match BFE patterns in isel.
4061 if (LHS.getOpcode() == ISD::AND) {
4062 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4063 unsigned MaskIdx, MaskLen;
4064 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4065 MaskIdx == ShiftAmt) {
4066 return DAG.getNode(
4067 ISD::AND, SL, VT,
4068 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4069 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4070 }
4071 }
4072 }
4073
4074 if (VT != MVT::i64)
4075 return SDValue();
4076
4077 if (ShiftAmt < 32)
4078 return SDValue();
4079
4080 // srl i64:x, C for C >= 32
4081 // =>
4082 // build_pair (srl hi_32(x), C - 32), 0
4083 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4084
4085 SDValue Hi = getHiHalf64(LHS, DAG);
4086
4087 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4088 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4089
4090 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4091
4092 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4093}
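// For example, (srl (and x, 0xff00), 8) becomes (and (srl x, 8), 0xff): the
// mask 0xff00 is a shifted mask with MaskIdx == ShiftAmt == 8, and shifting
// the mask right by 8 leaves 0xff, which is the form the BFE patterns expect.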
4094
4095 SDValue AMDGPUTargetLowering::performTruncateCombine(
4096 SDNode *N, DAGCombinerInfo &DCI) const {
4097 SDLoc SL(N);
4098 SelectionDAG &DAG = DCI.DAG;
4099 EVT VT = N->getValueType(0);
4100 SDValue Src = N->getOperand(0);
4101
4102 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4103 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4104 SDValue Vec = Src.getOperand(0);
4105 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4106 SDValue Elt0 = Vec.getOperand(0);
4107 EVT EltVT = Elt0.getValueType();
4108 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4109 if (EltVT.isFloatingPoint()) {
4110 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4111 EltVT.changeTypeToInteger(), Elt0);
4112 }
4113
4114 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4115 }
4116 }
4117 }
4118
4119 // Equivalent of above for accessing the high element of a vector as an
4120 // integer operation.
4121 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4122 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4123 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4124 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4125 SDValue BV = stripBitcast(Src.getOperand(0));
4126 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4127 BV.getValueType().getVectorNumElements() == 2) {
4128 SDValue SrcElt = BV.getOperand(1);
4129 EVT SrcEltVT = SrcElt.getValueType();
4130 if (SrcEltVT.isFloatingPoint()) {
4131 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4132 SrcEltVT.changeTypeToInteger(), SrcElt);
4133 }
4134
4135 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4136 }
4137 }
4138 }
4139 }
4140
4141 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4142 //
4143 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4144 // i16 (trunc (srl (i32 (trunc x), K)))
4145 if (VT.getScalarSizeInBits() < 32) {
4146 EVT SrcVT = Src.getValueType();
4147 if (SrcVT.getScalarSizeInBits() > 32 &&
4148 (Src.getOpcode() == ISD::SRL ||
4149 Src.getOpcode() == ISD::SRA ||
4150 Src.getOpcode() == ISD::SHL)) {
4151 SDValue Amt = Src.getOperand(1);
4152 KnownBits Known = DAG.computeKnownBits(Amt);
4153
4154 // - For left shifts, do the transform as long as the shift
4155 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4156 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4157 // losing information stored in the high bits when truncating.
4158 const unsigned MaxCstSize =
4159 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4160 if (Known.getMaxValue().ule(MaxCstSize)) {
4161 EVT MidVT = VT.isVector() ?
4162 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4163 VT.getVectorNumElements()) : MVT::i32;
4164
4165 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4166 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4167 Src.getOperand(0));
4168 DCI.AddToWorklist(Trunc.getNode());
4169
4170 if (Amt.getValueType() != NewShiftVT) {
4171 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4172 DCI.AddToWorklist(Amt.getNode());
4173 }
4174
4175 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4176 Trunc, Amt);
4177 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4178 }
4179 }
4180 }
4181
4182 return SDValue();
4183}
4184
4185// We need to specifically handle i64 mul here to avoid unnecessary conversion
4186// instructions. If we only match on the legalized i64 mul expansion,
4187// SimplifyDemandedBits will be unable to remove them because there will be
4188// multiple uses due to the separate mul + mulh[su].
4189static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4190 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4191 if (Size <= 32) {
4192 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4193 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4194 }
4195
4196 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4197 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4198
4199 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4200 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4201
4202 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4203}
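// The pair works because two 24-bit operands produce at most a 48-bit
// product: MUL_{I,U}24 supplies the low 32 bits and MULHI_{I,U}24 the high
// 32 bits, which BUILD_PAIR reassembles into the i64 result.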
4204
4205/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4206/// return SDValue().
4207static SDValue getAddOneOp(const SDNode *V) {
4208 if (V->getOpcode() != ISD::ADD)
4209 return SDValue();
4210
4211 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4212}
4213
4214 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4215 DAGCombinerInfo &DCI) const {
4216 assert(N->getOpcode() == ISD::MUL);
4217 EVT VT = N->getValueType(0);
4218
4219 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4220 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4221 // unnecessarily). isDivergent() is used as an approximation of whether the
4222 // value is in an SGPR.
4223 if (!N->isDivergent())
4224 return SDValue();
4225
4226 unsigned Size = VT.getSizeInBits();
4227 if (VT.isVector() || Size > 64)
4228 return SDValue();
4229
4230 SelectionDAG &DAG = DCI.DAG;
4231 SDLoc DL(N);
4232
4233 SDValue N0 = N->getOperand(0);
4234 SDValue N1 = N->getOperand(1);
4235
4236 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4237 // matching.
4238
4239 // mul x, (add y, 1) -> add (mul x, y), x
4240 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4241 SDValue AddOp = getAddOneOp(V.getNode());
4242 if (!AddOp)
4243 return SDValue();
4244
4245 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4246 return U->getOpcode() == ISD::MUL;
4247 }))
4248 return AddOp;
4249
4250 return SDValue();
4251 };
4252
4253 // FIXME: The selection pattern is not properly checking for commuted
4254 // operands, so we have to place the mul in the LHS
4255 if (SDValue MulOper = IsFoldableAdd(N0)) {
4256 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4257 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4258 }
4259
4260 if (SDValue MulOper = IsFoldableAdd(N1)) {
4261 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4262 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4263 }
4264
4265 // There are i16 integer mul/mad.
4266 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4267 return SDValue();
4268
4269 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4270 // in the source into any_extends if the result of the mul is truncated. Since
4271 // we can assume the high bits are whatever we want, use the underlying value
4272 // to keep the unknown high bits from interfering.
4273 if (N0.getOpcode() == ISD::ANY_EXTEND)
4274 N0 = N0.getOperand(0);
4275
4276 if (N1.getOpcode() == ISD::ANY_EXTEND)
4277 N1 = N1.getOperand(0);
4278
4279 SDValue Mul;
4280
4281 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4282 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4283 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4284 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4285 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4286 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4287 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4288 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4289 } else {
4290 return SDValue();
4291 }
4292
4293 // We need to use sext even for MUL_U24, because MUL_U24 is used
4294 // for signed multiply of 8 and 16-bit types.
4295 return DAG.getSExtOrTrunc(Mul, DL, VT);
4296}
4297
4298SDValue
4299 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4300 DAGCombinerInfo &DCI) const {
4301 if (N->getValueType(0) != MVT::i32)
4302 return SDValue();
4303
4304 SelectionDAG &DAG = DCI.DAG;
4305 SDLoc DL(N);
4306
4307 SDValue N0 = N->getOperand(0);
4308 SDValue N1 = N->getOperand(1);
4309
4310 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4311 // in the source into any_extends if the result of the mul is truncated. Since
4312 // we can assume the high bits are whatever we want, use the underlying value
4313 // to keep the unknown high bits from interfering.
4314 if (N0.getOpcode() == ISD::ANY_EXTEND)
4315 N0 = N0.getOperand(0);
4316 if (N1.getOpcode() == ISD::ANY_EXTEND)
4317 N1 = N1.getOperand(0);
4318
4319 // Try to use two fast 24-bit multiplies (one for each half of the result)
4320 // instead of one slow extending multiply.
4321 unsigned LoOpcode, HiOpcode;
4322 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4323 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4324 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4325 LoOpcode = AMDGPUISD::MUL_U24;
4326 HiOpcode = AMDGPUISD::MULHI_U24;
4327 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4328 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4329 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4330 LoOpcode = AMDGPUISD::MUL_I24;
4331 HiOpcode = AMDGPUISD::MULHI_I24;
4332 } else {
4333 return SDValue();
4334 }
4335
4336 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4337 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4338 DCI.CombineTo(N, Lo, Hi);
4339 return SDValue(N, 0);
4340}
4341
4342 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4343 DAGCombinerInfo &DCI) const {
4344 EVT VT = N->getValueType(0);
4345
4346 if (!Subtarget->hasMulI24() || VT.isVector())
4347 return SDValue();
4348
4349 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4350 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4351 // unnecessarily). isDivergent() is used as an approximation of whether the
4352 // value is in an SGPR.
4353 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4354 // valu op anyway)
4355 if (Subtarget->hasSMulHi() && !N->isDivergent())
4356 return SDValue();
4357
4358 SelectionDAG &DAG = DCI.DAG;
4359 SDLoc DL(N);
4360
4361 SDValue N0 = N->getOperand(0);
4362 SDValue N1 = N->getOperand(1);
4363
4364 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4365 return SDValue();
4366
4367 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4368 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4369
4370 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4371 DCI.AddToWorklist(Mulhi.getNode());
4372 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4373}
4374
4375 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4376 DAGCombinerInfo &DCI) const {
4377 EVT VT = N->getValueType(0);
4378
4379 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4380 return SDValue();
4381
4382 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4383 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4384 // unnecessarily). isDivergent() is used as an approximation of whether the
4385 // value is in an SGPR.
4386 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4387 // valu op anyway)
4388 if (Subtarget->hasSMulHi() && !N->isDivergent())
4389 return SDValue();
4390
4391 SelectionDAG &DAG = DCI.DAG;
4392 SDLoc DL(N);
4393
4394 SDValue N0 = N->getOperand(0);
4395 SDValue N1 = N->getOperand(1);
4396
4397 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4398 return SDValue();
4399
4400 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4401 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4402
4403 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4404 DCI.AddToWorklist(Mulhi.getNode());
4405 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4406}
4407
4408SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4409 SDValue Op,
4410 const SDLoc &DL,
4411 unsigned Opc) const {
4412 EVT VT = Op.getValueType();
4413 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4414 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4415 LegalVT != MVT::i16))
4416 return SDValue();
4417
4418 if (VT != MVT::i32)
4419 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4420
4421 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4422 if (VT != MVT::i32)
4423 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4424
4425 return FFBX;
4426}
4427
4428// The native instructions return -1 on 0 input. Optimize out a select that
4429// produces -1 on 0.
4430//
4431// TODO: If zero is not undef, we could also do this if the output is compared
4432// against the bitwidth.
4433//
4434// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4435 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4436 SDValue LHS, SDValue RHS,
4437 DAGCombinerInfo &DCI) const {
4438 if (!isNullConstant(Cond.getOperand(1)))
4439 return SDValue();
4440
4441 SelectionDAG &DAG = DCI.DAG;
4442 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4443 SDValue CmpLHS = Cond.getOperand(0);
4444
4445 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4446 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4447 if (CCOpcode == ISD::SETEQ &&
4448 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4449 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4450 unsigned Opc =
4451 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4452 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4453 }
4454
4455 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4456 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4457 if (CCOpcode == ISD::SETNE &&
4458 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4459 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4460 unsigned Opc =
4461 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4462
4463 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4464 }
4465
4466 return SDValue();
4467}
4468
4469 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4470 unsigned Op,
4471 const SDLoc &SL,
4472 SDValue Cond,
4473 SDValue N1,
4474 SDValue N2) {
4475 SelectionDAG &DAG = DCI.DAG;
4476 EVT VT = N1.getValueType();
4477
4478 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4479 N1.getOperand(0), N2.getOperand(0));
4480 DCI.AddToWorklist(NewSelect.getNode());
4481 return DAG.getNode(Op, SL, VT, NewSelect);
4482}
4483
4484// Pull a free FP operation out of a select so it may fold into uses.
4485//
4486// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4487// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4488//
4489// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4490// select c, (fabs x), +k -> fabs (select c, x, k)
4491SDValue
4492 AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4493 SDValue N) const {
4494 SelectionDAG &DAG = DCI.DAG;
4495 SDValue Cond = N.getOperand(0);
4496 SDValue LHS = N.getOperand(1);
4497 SDValue RHS = N.getOperand(2);
4498
4499 EVT VT = N.getValueType();
4500 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4501 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4502 if (!allUsesHaveSourceMods(N.getNode()))
4503 return SDValue();
4504
4505 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4506 SDLoc(N), Cond, LHS, RHS);
4507 }
4508
4509 bool Inv = false;
4510 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4511 std::swap(LHS, RHS);
4512 Inv = true;
4513 }
4514
4515 // TODO: Support vector constants.
4516 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4517 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4518 !selectSupportsSourceMods(N.getNode())) {
4519 SDLoc SL(N);
4520 // If one side is an fneg/fabs and the other is a constant, we can push the
4521 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4522 SDValue NewLHS = LHS.getOperand(0);
4523 SDValue NewRHS = RHS;
4524
4525 // Careful: if the neg can be folded up, don't try to pull it back down.
4526 bool ShouldFoldNeg = true;
4527
4528 if (NewLHS.hasOneUse()) {
4529 unsigned Opc = NewLHS.getOpcode();
4530 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4531 ShouldFoldNeg = false;
4532 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4533 ShouldFoldNeg = false;
4534 }
4535
4536 if (ShouldFoldNeg) {
4537 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4538 return SDValue();
4539
4540 // We're going to be forced to use a source modifier anyway, there's no
4541 // point to pulling the negate out unless we can get a size reduction by
4542 // negating the constant.
4543 //
4544 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4545 // about cheaper constants.
4546 if (NewLHS.getOpcode() == ISD::FABS &&
4547 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4548 return SDValue();
4549
4550 if (!allUsesHaveSourceMods(N.getNode()))
4551 return SDValue();
4552
4553 if (LHS.getOpcode() == ISD::FNEG)
4554 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4555
4556 if (Inv)
4557 std::swap(NewLHS, NewRHS);
4558
4559 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4560 Cond, NewLHS, NewRHS);
4561 DCI.AddToWorklist(NewSelect.getNode());
4562 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4563 }
4564 }
4565
4566 return SDValue();
4567}
4568
4569 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4570 DAGCombinerInfo &DCI) const {
4571 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4572 return Folded;
4573
4574 SDValue Cond = N->getOperand(0);
4575 if (Cond.getOpcode() != ISD::SETCC)
4576 return SDValue();
4577
4578 EVT VT = N->getValueType(0);
4579 SDValue LHS = Cond.getOperand(0);
4580 SDValue RHS = Cond.getOperand(1);
4581 SDValue CC = Cond.getOperand(2);
4582
4583 SDValue True = N->getOperand(1);
4584 SDValue False = N->getOperand(2);
4585
4586 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4587 SelectionDAG &DAG = DCI.DAG;
4588 if (DAG.isConstantValueOfAnyType(True) &&
4589 !DAG.isConstantValueOfAnyType(False)) {
4590 // Swap cmp + select pair to move constant to false input.
4591 // This will allow using VOPC cndmasks more often.
4592 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4593
4594 SDLoc SL(N);
4595 ISD::CondCode NewCC =
4596 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4597
4598 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4599 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4600 }
4601
4602 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4603 SDValue MinMax
4604 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4605 // Revisit this node so we can catch min3/max3/med3 patterns.
4606 //DCI.AddToWorklist(MinMax.getNode());
4607 return MinMax;
4608 }
4609 }
4610
4611 // There's no reason to not do this if the condition has other uses.
4612 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4613}
4614
4615static bool isInv2Pi(const APFloat &APF) {
4616 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4617 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4618 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4619
4620 return APF.bitwiseIsEqual(KF16) ||
4621 APF.bitwiseIsEqual(KF32) ||
4622 APF.bitwiseIsEqual(KF64);
4623}
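// All three bit patterns above encode 1/(2*pi) ~= 0.15915494 in half, single
// and double precision respectively, i.e. the value the hardware exposes as
// its 0.15915494 inline immediate.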
4624
4625// 0 and 1.0 / (2.0 * pi) have inline immediates, but their negated values
4626// do not, so there is an additional cost to negate them.
4627 TargetLowering::NegatibleCost
4628 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4629 if (C->isZero())
4630 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4631
4632 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4633 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4634
4635 return NegatibleCost::Neutral;
4636}
4637
4638 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4639 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4640 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4641 return false;
4642}
4643
4644 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4645 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4646 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4647 return false;
4648}
4649
4650static unsigned inverseMinMax(unsigned Opc) {
4651 switch (Opc) {
4652 case ISD::FMAXNUM:
4653 return ISD::FMINNUM;
4654 case ISD::FMINNUM:
4655 return ISD::FMAXNUM;
4656 case ISD::FMAXNUM_IEEE:
4657 return ISD::FMINNUM_IEEE;
4658 case ISD::FMINNUM_IEEE:
4659 return ISD::FMAXNUM_IEEE;
4660 case ISD::FMAXIMUM:
4661 return ISD::FMINIMUM;
4662 case ISD::FMINIMUM:
4663 return ISD::FMAXIMUM;
4664 case AMDGPUISD::FMAX_LEGACY:
4665 return AMDGPUISD::FMIN_LEGACY;
4666 case AMDGPUISD::FMIN_LEGACY:
4667 return AMDGPUISD::FMAX_LEGACY;
4668 default:
4669 llvm_unreachable("invalid min/max opcode");
4670 }
4671}
4672
4673/// \return true if it's profitable to try to push an fneg into its source
4674/// instruction.
4675 static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4676 // If the input has multiple uses and we can either fold the negate down, or
4677 // the other uses cannot, give up. This both prevents unprofitable
4678 // transformations and infinite loops: we won't repeatedly try to fold around
4679 // a negate that has no 'good' form.
4680 if (N0.hasOneUse()) {
4681 // This may be able to fold into the source, but at a code size cost. Don't
4682 // fold if the fold into the user is free.
4683 if (allUsesHaveSourceMods(N, 0))
4684 return false;
4685 } else {
4686 if (fnegFoldsIntoOp(N0.getNode()) &&
4687 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4688 return false;
4689 }
4690
4691 return true;
4692}
4693
4694 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4695 DAGCombinerInfo &DCI) const {
4696 SelectionDAG &DAG = DCI.DAG;
4697 SDValue N0 = N->getOperand(0);
4698 EVT VT = N->getValueType(0);
4699
4700 unsigned Opc = N0.getOpcode();
4701
4702 if (!shouldFoldFNegIntoSrc(N, N0))
4703 return SDValue();
4704
4705 SDLoc SL(N);
4706 switch (Opc) {
4707 case ISD::FADD: {
4708 if (!mayIgnoreSignedZero(N0))
4709 return SDValue();
4710
4711 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4712 SDValue LHS = N0.getOperand(0);
4713 SDValue RHS = N0.getOperand(1);
4714
4715 if (LHS.getOpcode() != ISD::FNEG)
4716 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4717 else
4718 LHS = LHS.getOperand(0);
4719
4720 if (RHS.getOpcode() != ISD::FNEG)
4721 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4722 else
4723 RHS = RHS.getOperand(0);
4724
4725 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4726 if (Res.getOpcode() != ISD::FADD)
4727 return SDValue(); // Op got folded away.
4728 if (!N0.hasOneUse())
4729 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4730 return Res;
4731 }
4732 case ISD::FMUL:
4733 case AMDGPUISD::FMUL_LEGACY: {
4734 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4735 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4736 SDValue LHS = N0.getOperand(0);
4737 SDValue RHS = N0.getOperand(1);
4738
4739 if (LHS.getOpcode() == ISD::FNEG)
4740 LHS = LHS.getOperand(0);
4741 else if (RHS.getOpcode() == ISD::FNEG)
4742 RHS = RHS.getOperand(0);
4743 else
4744 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4745
4746 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4747 if (Res.getOpcode() != Opc)
4748 return SDValue(); // Op got folded away.
4749 if (!N0.hasOneUse())
4750 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4751 return Res;
4752 }
4753 case ISD::FMA:
4754 case ISD::FMAD: {
4755 // TODO: handle llvm.amdgcn.fma.legacy
4756 if (!mayIgnoreSignedZero(N0))
4757 return SDValue();
4758
4759 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4760 SDValue LHS = N0.getOperand(0);
4761 SDValue MHS = N0.getOperand(1);
4762 SDValue RHS = N0.getOperand(2);
4763
4764 if (LHS.getOpcode() == ISD::FNEG)
4765 LHS = LHS.getOperand(0);
4766 else if (MHS.getOpcode() == ISD::FNEG)
4767 MHS = MHS.getOperand(0);
4768 else
4769 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4770
4771 if (RHS.getOpcode() != ISD::FNEG)
4772 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4773 else
4774 RHS = RHS.getOperand(0);
4775
4776 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4777 if (Res.getOpcode() != Opc)
4778 return SDValue(); // Op got folded away.
4779 if (!N0.hasOneUse())
4780 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4781 return Res;
4782 }
4783 case ISD::FMAXNUM:
4784 case ISD::FMINNUM:
4785 case ISD::FMAXNUM_IEEE:
4786 case ISD::FMINNUM_IEEE:
4787 case ISD::FMINIMUM:
4788 case ISD::FMAXIMUM:
4789 case AMDGPUISD::FMAX_LEGACY:
4790 case AMDGPUISD::FMIN_LEGACY: {
4791 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4792 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4793 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4794 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4795
4796 SDValue LHS = N0.getOperand(0);
4797 SDValue RHS = N0.getOperand(1);
4798
4799 // 0 doesn't have a negated inline immediate.
4800 // TODO: This constant check should be generalized to other operations.
4801 if (isConstantCostlierToNegate(RHS))
4802 return SDValue();
4803
4804 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4805 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4806 unsigned Opposite = inverseMinMax(Opc);
4807
4808 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4809 if (Res.getOpcode() != Opposite)
4810 return SDValue(); // Op got folded away.
4811 if (!N0.hasOneUse())
4812 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4813 return Res;
4814 }
4815 case AMDGPUISD::FMED3: {
4816 SDValue Ops[3];
4817 for (unsigned I = 0; I < 3; ++I)
4818 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4819
4820 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4821 if (Res.getOpcode() != AMDGPUISD::FMED3)
4822 return SDValue(); // Op got folded away.
4823
4824 if (!N0.hasOneUse()) {
4825 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4826 DAG.ReplaceAllUsesWith(N0, Neg);
4827
4828 for (SDNode *U : Neg->uses())
4829 DCI.AddToWorklist(U);
4830 }
4831
4832 return Res;
4833 }
4834 case ISD::FP_EXTEND:
4835 case ISD::FTRUNC:
4836 case ISD::FRINT:
4837 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4838 case ISD::FROUNDEVEN:
4839 case ISD::FSIN:
4840 case ISD::FCANONICALIZE:
4841 case AMDGPUISD::RCP:
4842 case AMDGPUISD::RCP_LEGACY:
4843 case AMDGPUISD::RCP_IFLAG:
4844 case AMDGPUISD::SIN_HW: {
4845 SDValue CvtSrc = N0.getOperand(0);
4846 if (CvtSrc.getOpcode() == ISD::FNEG) {
4847 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4848 // (fneg (rcp (fneg x))) -> (rcp x)
4849 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4850 }
4851
4852 if (!N0.hasOneUse())
4853 return SDValue();
4854
4855 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4856 // (fneg (rcp x)) -> (rcp (fneg x))
4857 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4858 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4859 }
4860 case ISD::FP_ROUND: {
4861 SDValue CvtSrc = N0.getOperand(0);
4862
4863 if (CvtSrc.getOpcode() == ISD::FNEG) {
4864 // (fneg (fp_round (fneg x))) -> (fp_round x)
4865 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4866 CvtSrc.getOperand(0), N0.getOperand(1));
4867 }
4868
4869 if (!N0.hasOneUse())
4870 return SDValue();
4871
4872 // (fneg (fp_round x)) -> (fp_round (fneg x))
4873 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4874 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4875 }
4876 case ISD::FP16_TO_FP: {
4877 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4878 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4879 // Put the fneg back as a legal source operation that can be matched later.
4880 SDLoc SL(N);
4881
4882 SDValue Src = N0.getOperand(0);
4883 EVT SrcVT = Src.getValueType();
4884
4885 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4886 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4887 DAG.getConstant(0x8000, SL, SrcVT));
4888 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4889 }
4890 case ISD::SELECT: {
4891 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4892 // TODO: Invert conditions of foldFreeOpFromSelect
4893 return SDValue();
4894 }
4895 case ISD::BITCAST: {
4896 SDLoc SL(N);
4897 SDValue BCSrc = N0.getOperand(0);
4898 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4899 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4900 if (HighBits.getValueType().getSizeInBits() != 32 ||
4901 !fnegFoldsIntoOp(HighBits.getNode()))
4902 return SDValue();
4903
4904 // f64 fneg only really needs to operate on the high half of the
4905 // register, so try to force it to an f32 operation to help make use of
4906 // source modifiers.
4907 //
4908 //
4909 // fneg (f64 (bitcast (build_vector x, y))) ->
4910 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4911 // (fneg (bitcast i32:y to f32)))
4912
4913 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4914 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4915 SDValue CastBack =
4916 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4917
4918 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4919 Ops.back() = CastBack;
4920 DCI.AddToWorklist(NegHi.getNode());
4921 SDValue Build =
4922 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4923 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4924
4925 if (!N0.hasOneUse())
4926 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4927 return Result;
4928 }
4929
4930 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4931 BCSrc.hasOneUse()) {
4932 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4933 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4934
4935 // TODO: Cast back result for multiple uses is beneficial in some cases.
4936
4937 SDValue LHS =
4938 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4939 SDValue RHS =
4940 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4941
4942 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4943 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4944
4945 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4946 NegRHS);
4947 }
4948
4949 return SDValue();
4950 }
4951 default:
4952 return SDValue();
4953 }
4954}
4955
4956 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4957 DAGCombinerInfo &DCI) const {
4958 SelectionDAG &DAG = DCI.DAG;
4959 SDValue N0 = N->getOperand(0);
4960
4961 if (!N0.hasOneUse())
4962 return SDValue();
4963
4964 switch (N0.getOpcode()) {
4965 case ISD::FP16_TO_FP: {
4966 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4967 SDLoc SL(N);
4968 SDValue Src = N0.getOperand(0);
4969 EVT SrcVT = Src.getValueType();
4970
4971 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4972 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4973 DAG.getConstant(0x7fff, SL, SrcVT));
4974 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4975 }
4976 default:
4977 return SDValue();
4978 }
4979}
4980
4981 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4982 DAGCombinerInfo &DCI) const {
4983 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4984 if (!CFP)
4985 return SDValue();
4986
4987 // XXX - Should this flush denormals?
4988 const APFloat &Val = CFP->getValueAPF();
4989 APFloat One(Val.getSemantics(), "1.0");
4990 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4991}
4992
4993 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4994 DAGCombinerInfo &DCI) const {
4995 SelectionDAG &DAG = DCI.DAG;
4996 SDLoc DL(N);
4997
4998 switch(N->getOpcode()) {
4999 default:
5000 break;
5001 case ISD::BITCAST: {
5002 EVT DestVT = N->getValueType(0);
5003
5004 // Push casts through vector builds. This helps avoid emitting a large
5005 // number of copies when materializing floating point vector constants.
5006 //
5007 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5008 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5009 if (DestVT.isVector()) {
5010 SDValue Src = N->getOperand(0);
5011 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5012 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5013 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5014 EVT SrcVT = Src.getValueType();
5015 unsigned NElts = DestVT.getVectorNumElements();
5016
5017 if (SrcVT.getVectorNumElements() == NElts) {
5018 EVT DestEltVT = DestVT.getVectorElementType();
5019
5020 SmallVector<SDValue, 8> CastedElts;
5021 SDLoc SL(N);
5022 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5023 SDValue Elt = Src.getOperand(I);
5024 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5025 }
5026
5027 return DAG.getBuildVector(DestVT, SL, CastedElts);
5028 }
5029 }
5030 }
5031
5032 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5033 break;
5034
5035 // Fold bitcasts of constants.
5036 //
5037 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5038 // TODO: Generalize and move to DAGCombiner
5039 SDValue Src = N->getOperand(0);
5040 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5041 SDLoc SL(N);
5042 uint64_t CVal = C->getZExtValue();
5043 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5044 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5045 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5046 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5047 }
5048
5049 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5050 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5051 SDLoc SL(N);
5052 uint64_t CVal = Val.getZExtValue();
5053 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5054 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5055 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5056
5057 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5058 }
5059
5060 break;
5061 }
5062 case ISD::SHL: {
5063 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5064 break;
5065
5066 return performShlCombine(N, DCI);
5067 }
5068 case ISD::SRL: {
5069 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5070 break;
5071
5072 return performSrlCombine(N, DCI);
5073 }
5074 case ISD::SRA: {
5075 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5076 break;
5077
5078 return performSraCombine(N, DCI);
5079 }
5080 case ISD::TRUNCATE:
5081 return performTruncateCombine(N, DCI);
5082 case ISD::MUL:
5083 return performMulCombine(N, DCI);
5084 case AMDGPUISD::MUL_U24:
5085 case AMDGPUISD::MUL_I24: {
5086 if (SDValue Simplified = simplifyMul24(N, DCI))
5087 return Simplified;
5088 break;
5089 }
5090 case AMDGPUISD::MULHI_I24:
5091 case AMDGPUISD::MULHI_U24:
5092 return simplifyMul24(N, DCI);
5093 case ISD::SMUL_LOHI:
5094 case ISD::UMUL_LOHI:
5095 return performMulLoHiCombine(N, DCI);
5096 case ISD::MULHS:
5097 return performMulhsCombine(N, DCI);
5098 case ISD::MULHU:
5099 return performMulhuCombine(N, DCI);
5100 case ISD::SELECT:
5101 return performSelectCombine(N, DCI);
5102 case ISD::FNEG:
5103 return performFNegCombine(N, DCI);
5104 case ISD::FABS:
5105 return performFAbsCombine(N, DCI);
5106 case AMDGPUISD::BFE_I32:
5107 case AMDGPUISD::BFE_U32: {
5108 assert(!N->getValueType(0).isVector() &&
5109 "Vector handling of BFE not implemented");
5110 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5111 if (!Width)
5112 break;
5113
5114 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5115 if (WidthVal == 0)
5116 return DAG.getConstant(0, DL, MVT::i32);
5117
5118 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5119 if (!Offset)
5120 break;
5121
5122 SDValue BitsFrom = N->getOperand(0);
5123 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5124
5125 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5126
5127 if (OffsetVal == 0) {
5128 // This is already sign / zero extended, so try to fold away extra BFEs.
5129 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5130
5131 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5132 if (OpSignBits >= SignBits)
5133 return BitsFrom;
5134
5135 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5136 if (Signed) {
5137 // This is a sign_extend_inreg. Replace it to take advantage of existing
5138 // DAG Combines. If not eliminated, we will match back to BFE during
5139 // selection.
5140
5141 // TODO: The sext_inreg of extended types ends up here, although we could
5142 // handle them in a single BFE.
5143 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5144 DAG.getValueType(SmallVT));
5145 }
5146
5147 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5148 }
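// For instance, a BFE_I32 with OffsetVal == 0 and WidthVal == 8 behaves like a
// sign_extend_inreg from i8: the result has at least 32 - 8 + 1 = 25 sign
// bits, so a source already known to provide 25 or more sign bits is returned
// unchanged by the fold above.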
5149
5150 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5151 if (Signed) {
5152 return constantFoldBFE<int32_t>(DAG,
5153 CVal->getSExtValue(),
5154 OffsetVal,
5155 WidthVal,
5156 DL);
5157 }
5158
5159 return constantFoldBFE<uint32_t>(DAG,
5160 CVal->getZExtValue(),
5161 OffsetVal,
5162 WidthVal,
5163 DL);
5164 }
5165
5166 if ((OffsetVal + WidthVal) >= 32 &&
5167 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5168 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5169 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5170 BitsFrom, ShiftVal);
5171 }
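// For example, OffsetVal == 24 and WidthVal == 8 cover bits [31:24], so the
// extract above degenerates to a plain shift by 24: srl for BFE_U32 and sra
// for BFE_I32.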
5172
5173 if (BitsFrom.hasOneUse()) {
5174 APInt Demanded = APInt::getBitsSet(32,
5175 OffsetVal,
5176 OffsetVal + WidthVal);
5177
5178 KnownBits Known;
5179 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5180 !DCI.isBeforeLegalizeOps());
5181 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5182 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5183 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5184 DCI.CommitTargetLoweringOpt(TLO);
5185 }
5186 }
5187
5188 break;
5189 }
5190 case ISD::LOAD:
5191 return performLoadCombine(N, DCI);
5192 case ISD::STORE:
5193 return performStoreCombine(N, DCI);
5194 case AMDGPUISD::RCP:
5195 case AMDGPUISD::RCP_IFLAG:
5196 return performRcpCombine(N, DCI);
5197 case ISD::AssertZext:
5198 case ISD::AssertSext:
5199 return performAssertSZExtCombine(N, DCI);
5200 case ISD::INTRINSIC_WO_CHAIN:
5201 return performIntrinsicWOChainCombine(N, DCI);
5202 case AMDGPUISD::FMAD_FTZ: {
5203 SDValue N0 = N->getOperand(0);
5204 SDValue N1 = N->getOperand(1);
5205 SDValue N2 = N->getOperand(2);
5206 EVT VT = N->getValueType(0);
5207
5208 // FMAD_FTZ is a FMAD + flush denormals to zero.
5209 // We flush the inputs, the intermediate step, and the output.
5210 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5211 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5212 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5213 if (N0CFP && N1CFP && N2CFP) {
5214 const auto FTZ = [](const APFloat &V) {
5215 if (V.isDenormal()) {
5216 APFloat Zero(V.getSemantics(), 0);
5217 return V.isNegative() ? -Zero : Zero;
5218 }
5219 return V;
5220 };
5221
5222 APFloat V0 = FTZ(N0CFP->getValueAPF());
5223 APFloat V1 = FTZ(N1CFP->getValueAPF());
5224 APFloat V2 = FTZ(N2CFP->getValueAPF());
5225 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5226 V0 = FTZ(V0);
5227 V0.add(V2, APFloat::rmNearestTiesToEven);
5228 return DAG.getConstantFP(FTZ(V0), DL, VT);
5229 }
5230 break;
5231 }
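// As a rough illustration of the folding above: every f32 denormal (magnitude
// below 2^-126) is flushed to a same-signed zero, the intermediate product is
// flushed before the add, and the final sum is flushed once more, so e.g.
// fmad_ftz(0x1p-140, 1.0, 2.0) constant-folds to 2.0.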
5232 }
5233 return SDValue();
5234}
5235
5236//===----------------------------------------------------------------------===//
5237// Helper functions
5238//===----------------------------------------------------------------------===//
5239
5240 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5241 const TargetRegisterClass *RC,
5242 Register Reg, EVT VT,
5243 const SDLoc &SL,
5244 bool RawReg) const {
5245 MachineFunction &MF = DAG.getMachineFunction();
5246 MachineRegisterInfo &MRI = MF.getRegInfo();
5247 Register VReg;
5248
5249 if (!MRI.isLiveIn(Reg)) {
5250 VReg = MRI.createVirtualRegister(RC);
5251 MRI.addLiveIn(Reg, VReg);
5252 } else {
5253 VReg = MRI.getLiveInVirtReg(Reg);
5254 }
5255
5256 if (RawReg)
5257 return DAG.getRegister(VReg, VT);
5258
5259 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5260}
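// Usage sketch (register class chosen for illustration): CreateLiveInRegister(
// DAG, &AMDGPU::SReg_64RegClass, Reg, MVT::i64, SL) maps the physical register
// to a virtual live-in and returns a CopyFromReg from the entry node; passing
// RawReg = true instead returns the bare register node so the caller can emit
// its own copy.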
5261
5262// This may be called multiple times, and nothing prevents creating multiple
5263// objects at the same offset. See if we already defined this object.
5264 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5265 int64_t Offset) {
5266 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5267 if (MFI.getObjectOffset(I) == Offset) {
5268 assert(MFI.getObjectSize(I) == Size);
5269 return I;
5270 }
5271 }
5272
5273 return MFI.CreateFixedObject(Size, Offset, true);
5274}
5275
5276 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5277 EVT VT,
5278 const SDLoc &SL,
5279 int64_t Offset) const {
5280 MachineFunction &MF = DAG.getMachineFunction();
5281 MachineFrameInfo &MFI = MF.getFrameInfo();
5282 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5283
5284 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5285 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5286
5287 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5288 MachineMemOperand::MODereferenceable |
5289 MachineMemOperand::MOInvariant);
5290}
5291
5292 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5293 const SDLoc &SL,
5294 SDValue Chain,
5295 SDValue ArgVal,
5296 int64_t Offset) const {
5297 MachineFunction &MF = DAG.getMachineFunction();
5298 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5299 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5300
5301 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5302 // Stores to the argument stack area are relative to the stack pointer.
5303 SDValue SP =
5304 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5305 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5306 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5307 MachineMemOperand::MODereferenceable);
5308 return Store;
5309}
5310
5311 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5312 const TargetRegisterClass *RC,
5313 EVT VT, const SDLoc &SL,
5314 const ArgDescriptor &Arg) const {
5315 assert(Arg && "Attempting to load missing argument");
5316
5317 SDValue V = Arg.isRegister() ?
5318 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5319 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5320
5321 if (!Arg.isMasked())
5322 return V;
5323
5324 unsigned Mask = Arg.getMask();
5325 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5326 V = DAG.getNode(ISD::SRL, SL, VT, V,
5327 DAG.getShiftAmountConstant(Shift, VT, SL));
5328 return DAG.getNode(ISD::AND, SL, VT, V,
5329 DAG.getConstant(Mask >> Shift, SL, VT));
5330}
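// Masked-argument example: for a field packed at bits [19:10] the descriptor
// mask is 0xffc00, Shift = countr_zero(0xffc00) = 10, and the loaded value is
// reduced to (V >> 10) & 0x3ff, so only the 10-bit field survives.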
5331
5332 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5333 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5334 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5335 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5336 uint64_t ArgOffset =
5337 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5338 switch (Param) {
5339 case FIRST_IMPLICIT:
5340 return ArgOffset;
5341 case PRIVATE_BASE:
5342 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5343 case SHARED_BASE:
5344 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5345 case QUEUE_PTR:
5346 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5347 }
5348 llvm_unreachable("unexpected implicit parameter type");
5349}
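// Worked example with made-up sizes: ExplicitKernArgSize = 36 and an 8-byte
// implicit-argument alignment give alignTo(36, 8) = 40, so FIRST_IMPLICIT
// starts 40 bytes past the explicit kernarg area (plus any target-specific
// ExplicitArgOffset), and QUEUE_PTR sits QUEUE_PTR_OFFSET bytes after that.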
5350
5351 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5352 const MachineFunction &MF, const ImplicitParameter Param) const {
5353 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5354 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5355}
5356
5357#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5358
5359const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5360 switch ((AMDGPUISD::NodeType)Opcode) {
5361 case AMDGPUISD::FIRST_NUMBER: break;
5362 // AMDIL DAG nodes
5363 NODE_NAME_CASE(UMUL);
5364 NODE_NAME_CASE(BRANCH_COND);
5365
5366 // AMDGPU DAG nodes
5367 NODE_NAME_CASE(IF)
5368 NODE_NAME_CASE(ELSE)
5369 NODE_NAME_CASE(LOOP)
5370 NODE_NAME_CASE(CALL)
5371 NODE_NAME_CASE(TC_RETURN)
5372 NODE_NAME_CASE(TC_RETURN_GFX)
5373 NODE_NAME_CASE(TC_RETURN_CHAIN)
5374 NODE_NAME_CASE(TRAP)
5375 NODE_NAME_CASE(RET_GLUE)
5376 NODE_NAME_CASE(WAVE_ADDRESS)
5377 NODE_NAME_CASE(RETURN_TO_EPILOG)
5378 NODE_NAME_CASE(ENDPGM)
5379 NODE_NAME_CASE(ENDPGM_TRAP)
5380 NODE_NAME_CASE(SIMULATED_TRAP)
5381 NODE_NAME_CASE(DWORDADDR)
5382 NODE_NAME_CASE(FRACT)
5383 NODE_NAME_CASE(SETCC)
5384 NODE_NAME_CASE(SETREG)
5385 NODE_NAME_CASE(DENORM_MODE)
5386 NODE_NAME_CASE(FMA_W_CHAIN)
5387 NODE_NAME_CASE(FMUL_W_CHAIN)
5388 NODE_NAME_CASE(CLAMP)
5389 NODE_NAME_CASE(COS_HW)
5390 NODE_NAME_CASE(SIN_HW)
5391 NODE_NAME_CASE(FMAX_LEGACY)
5392 NODE_NAME_CASE(FMIN_LEGACY)
5393 NODE_NAME_CASE(FMAX3)
5394 NODE_NAME_CASE(SMAX3)
5395 NODE_NAME_CASE(UMAX3)
5396 NODE_NAME_CASE(FMIN3)
5397 NODE_NAME_CASE(SMIN3)
5398 NODE_NAME_CASE(UMIN3)
5399 NODE_NAME_CASE(FMED3)
5400 NODE_NAME_CASE(SMED3)
5401 NODE_NAME_CASE(UMED3)
5402 NODE_NAME_CASE(FMAXIMUM3)
5403 NODE_NAME_CASE(FMINIMUM3)
5404 NODE_NAME_CASE(FDOT2)
5405 NODE_NAME_CASE(URECIP)
5406 NODE_NAME_CASE(DIV_SCALE)
5407 NODE_NAME_CASE(DIV_FMAS)
5408 NODE_NAME_CASE(DIV_FIXUP)
5409 NODE_NAME_CASE(FMAD_FTZ)
5410 NODE_NAME_CASE(RCP)
5411 NODE_NAME_CASE(RSQ)
5412 NODE_NAME_CASE(RCP_LEGACY)
5413 NODE_NAME_CASE(RCP_IFLAG)
5414 NODE_NAME_CASE(LOG)
5415 NODE_NAME_CASE(EXP)
5416 NODE_NAME_CASE(FMUL_LEGACY)
5417 NODE_NAME_CASE(RSQ_CLAMP)
5418 NODE_NAME_CASE(FP_CLASS)
5419 NODE_NAME_CASE(DOT4)
5420 NODE_NAME_CASE(CARRY)
5421 NODE_NAME_CASE(BORROW)
5422 NODE_NAME_CASE(BFE_U32)
5423 NODE_NAME_CASE(BFE_I32)
5424 NODE_NAME_CASE(BFI)
5425 NODE_NAME_CASE(BFM)
5426 NODE_NAME_CASE(FFBH_U32)
5427 NODE_NAME_CASE(FFBH_I32)
5428 NODE_NAME_CASE(FFBL_B32)
5429 NODE_NAME_CASE(MUL_U24)
5430 NODE_NAME_CASE(MUL_I24)
5431 NODE_NAME_CASE(MULHI_U24)
5432 NODE_NAME_CASE(MULHI_I24)
5433 NODE_NAME_CASE(MAD_U24)
5434 NODE_NAME_CASE(MAD_I24)
5435 NODE_NAME_CASE(MAD_I64_I32)
5436 NODE_NAME_CASE(MAD_U64_U32)
5437 NODE_NAME_CASE(PERM)
5438 NODE_NAME_CASE(TEXTURE_FETCH)
5439 NODE_NAME_CASE(R600_EXPORT)
5440 NODE_NAME_CASE(CONST_ADDRESS)
5441 NODE_NAME_CASE(REGISTER_LOAD)
5442 NODE_NAME_CASE(REGISTER_STORE)
5443 NODE_NAME_CASE(SAMPLE)
5444 NODE_NAME_CASE(SAMPLEB)
5445 NODE_NAME_CASE(SAMPLED)
5446 NODE_NAME_CASE(SAMPLEL)
5447 NODE_NAME_CASE(CVT_F32_UBYTE0)
5448 NODE_NAME_CASE(CVT_F32_UBYTE1)
5449 NODE_NAME_CASE(CVT_F32_UBYTE2)
5450 NODE_NAME_CASE(CVT_F32_UBYTE3)
5451 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5452 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5453 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5454 NODE_NAME_CASE(CVT_PK_I16_I32)
5455 NODE_NAME_CASE(CVT_PK_U16_U32)
5456 NODE_NAME_CASE(FP_TO_FP16)
5457 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5458 NODE_NAME_CASE(CONST_DATA_PTR)
5459 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5460 NODE_NAME_CASE(LDS)
5461 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5462 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5463 NODE_NAME_CASE(DUMMY_CHAIN)
5464 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5465 NODE_NAME_CASE(LOAD_D16_HI)
5466 NODE_NAME_CASE(LOAD_D16_LO)
5467 NODE_NAME_CASE(LOAD_D16_HI_I8)
5468 NODE_NAME_CASE(LOAD_D16_HI_U8)
5469 NODE_NAME_CASE(LOAD_D16_LO_I8)
5470 NODE_NAME_CASE(LOAD_D16_LO_U8)
5471 NODE_NAME_CASE(STORE_MSKOR)
5472 NODE_NAME_CASE(LOAD_CONSTANT)
5473 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5474 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5475 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5476 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5477 NODE_NAME_CASE(DS_ORDERED_COUNT)
5478 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5479 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5480 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5481 NODE_NAME_CASE(BUFFER_LOAD)
5482 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5483 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5484 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5485 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5486 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5487 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5488 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5489 NODE_NAME_CASE(SBUFFER_LOAD)
5490 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5491 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5492 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5493 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5494 NODE_NAME_CASE(BUFFER_STORE)
5495 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5496 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5497 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5498 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5499 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5500 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5501 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5502 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5503 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5504 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5505 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5506 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5507 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5508 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5509 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5510 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5511 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5512 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5513 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5514 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5515 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5516 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5517 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5518
5519 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5520 }
5521 return nullptr;
5522}
5523
5524 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5525 SelectionDAG &DAG, int Enabled,
5526 int &RefinementSteps,
5527 bool &UseOneConstNR,
5528 bool Reciprocal) const {
5529 EVT VT = Operand.getValueType();
5530
5531 if (VT == MVT::f32) {
5532 RefinementSteps = 0;
5533 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5534 }
5535
5536 // TODO: There is also an f64 rsq instruction, but the documentation is less
5537 // clear on its precision.
5538
5539 return SDValue();
5540}
5541
5542 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5543 SelectionDAG &DAG, int Enabled,
5544 int &RefinementSteps) const {
5545 EVT VT = Operand.getValueType();
5546
5547 if (VT == MVT::f32) {
5548 // Reciprocal, < 1 ulp error.
5549 //
5550 // This reciprocal approximation converges to < 0.5 ulp error with one
5551 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5552
5553 RefinementSteps = 0;
5554 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5555 }
5556
5557 // TODO: There is also an f64 rcp instruction, but the documentation is less
5558 // clear on its precision.
5559
5560 return SDValue();
5561}
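// Note: the single refinement step mentioned above is the standard
// Newton-Raphson iteration for 1/a. Given an estimate x, the refined value is
//   x' = x * (2 - a*x) = fma(x, fma(-a, x, 1.0), x)
// i.e. two FMAs per step, with each step roughly doubling the number of
// correct bits in the estimate.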
5562
5563static unsigned workitemIntrinsicDim(unsigned ID) {
5564 switch (ID) {
5565 case Intrinsic::amdgcn_workitem_id_x:
5566 return 0;
5567 case Intrinsic::amdgcn_workitem_id_y:
5568 return 1;
5569 case Intrinsic::amdgcn_workitem_id_z:
5570 return 2;
5571 default:
5572 llvm_unreachable("not a workitem intrinsic");
5573 }
5574}
5575
5576 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5577 const SDValue Op, KnownBits &Known,
5578 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5579
5580 Known.resetAll(); // Don't know anything.
5581
5582 unsigned Opc = Op.getOpcode();
5583
5584 switch (Opc) {
5585 default:
5586 break;
5587 case AMDGPUISD::CARRY:
5588 case AMDGPUISD::BORROW: {
5589 Known.Zero = APInt::getHighBitsSet(32, 31);
5590 break;
5591 }
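// CARRY and BORROW only ever produce 0 or 1, so getHighBitsSet(32, 31) marks
// bits [31:1] as known zero and leaves just bit 0 undetermined.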
5592
5593 case AMDGPUISD::BFE_I32:
5594 case AMDGPUISD::BFE_U32: {
5595 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5596 if (!CWidth)
5597 return;
5598
5599 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5600
5601 if (Opc == AMDGPUISD::BFE_U32)
5602 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5603
5604 break;
5605 }
5606 case AMDGPUISD::FP_TO_FP16: {
5607 unsigned BitWidth = Known.getBitWidth();
5608
5609 // High bits are zero.
5610 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5611 break;
5612 }
5613 case AMDGPUISD::MUL_U24:
5614 case AMDGPUISD::MUL_I24: {
5615 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5616 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5617 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5618 RHSKnown.countMinTrailingZeros();
5619 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5620 // Skip extra check if all bits are known zeros.
5621 if (TrailZ >= 32)
5622 break;
5623
5624 // Truncate to 24 bits.
5625 LHSKnown = LHSKnown.trunc(24);
5626 RHSKnown = RHSKnown.trunc(24);
5627
5628 if (Opc == AMDGPUISD::MUL_I24) {
5629 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5630 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5631 unsigned MaxValBits = LHSValBits + RHSValBits;
5632 if (MaxValBits > 32)
5633 break;
5634 unsigned SignBits = 32 - MaxValBits + 1;
5635 bool LHSNegative = LHSKnown.isNegative();
5636 bool LHSNonNegative = LHSKnown.isNonNegative();
5637 bool LHSPositive = LHSKnown.isStrictlyPositive();
5638 bool RHSNegative = RHSKnown.isNegative();
5639 bool RHSNonNegative = RHSKnown.isNonNegative();
5640 bool RHSPositive = RHSKnown.isStrictlyPositive();
5641
5642 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5643 Known.Zero.setHighBits(SignBits);
5644 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5645 Known.One.setHighBits(SignBits);
5646 } else {
5647 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5648 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5649 unsigned MaxValBits = LHSValBits + RHSValBits;
5650 if (MaxValBits >= 32)
5651 break;
5652 Known.Zero.setBitsFrom(MaxValBits);
5653 }
5654 break;
5655 }
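// Example of the unsigned bound above: if the low 24 bits of the operands are
// known to fit in 7 and 9 bits respectively, the product fits in at most
// 7 + 9 = 16 bits, so setBitsFrom(16) marks bits [31:16] as known zero.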
5656 case AMDGPUISD::PERM: {
5657 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5658 if (!CMask)
5659 return;
5660
5661 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5662 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5663 unsigned Sel = CMask->getZExtValue();
5664
5665 for (unsigned I = 0; I < 32; I += 8) {
5666 unsigned SelBits = Sel & 0xff;
5667 if (SelBits < 4) {
5668 SelBits *= 8;
5669 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5670 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5671 } else if (SelBits < 7) {
5672 SelBits = (SelBits & 3) * 8;
5673 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5674 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5675 } else if (SelBits == 0x0c) {
5676 Known.Zero |= 0xFFull << I;
5677 } else if (SelBits > 0x0c) {
5678 Known.One |= 0xFFull << I;
5679 }
5680 Sel >>= 8;
5681 }
5682 break;
5683 }
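// For instance, a constant selector whose four bytes are all 0x0c forces every
// result byte to zero, so the whole 32-bit PERM result becomes known-zero.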
5684 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5685 Known.Zero.setHighBits(24);
5686 break;
5687 }
5688 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5689 Known.Zero.setHighBits(16);
5690 break;
5691 }
5692 case AMDGPUISD::LDS: {
5693 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5694 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5695
5696 Known.Zero.setHighBits(16);
5697 Known.Zero.setLowBits(Log2(Alignment));
5698 break;
5699 }
5700 case AMDGPUISD::SMIN3:
5701 case AMDGPUISD::SMAX3:
5702 case AMDGPUISD::SMED3:
5703 case AMDGPUISD::UMIN3:
5704 case AMDGPUISD::UMAX3:
5705 case AMDGPUISD::UMED3: {
5706 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5707 if (Known2.isUnknown())
5708 break;
5709
5710 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5711 if (Known1.isUnknown())
5712 break;
5713
5714 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5715 if (Known0.isUnknown())
5716 break;
5717
5718 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5719 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5720 Known.One = Known0.One & Known1.One & Known2.One;
5721 break;
5722 }
5723 case ISD::INTRINSIC_WO_CHAIN: {
5724 unsigned IID = Op.getConstantOperandVal(0);
5725 switch (IID) {
5726 case Intrinsic::amdgcn_workitem_id_x:
5727 case Intrinsic::amdgcn_workitem_id_y:
5728 case Intrinsic::amdgcn_workitem_id_z: {
5729 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5730 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5731 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5732 break;
5733 }
5734 default:
5735 break;
5736 }
5737 }
5738 }
5739}
5740
5741 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5742 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5743 unsigned Depth) const {
5744 switch (Op.getOpcode()) {
5745 case AMDGPUISD::BFE_I32: {
5746 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5747 if (!Width)
5748 return 1;
5749
5750 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5751 if (!isNullConstant(Op.getOperand(1)))
5752 return SignBits;
5753
5754 // TODO: Could probably figure something out with non-0 offsets.
5755 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5756 return std::max(SignBits, Op0SignBits);
5757 }
5758
5759 case AMDGPUISD::BFE_U32: {
5760 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5761 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5762 }
5763
5764 case AMDGPUISD::CARRY:
5765 case AMDGPUISD::BORROW:
5766 return 31;
5767 case AMDGPUISD::BUFFER_LOAD_BYTE:
5768 return 25;
5769 case AMDGPUISD::BUFFER_LOAD_SHORT:
5770 return 17;
5771 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5772 return 24;
5773 case AMDGPUISD::BUFFER_LOAD_USHORT:
5774 return 16;
5775 case AMDGPUISD::FP_TO_FP16:
5776 return 16;
5777 case AMDGPUISD::SMIN3:
5778 case AMDGPUISD::SMAX3:
5779 case AMDGPUISD::SMED3:
5780 case AMDGPUISD::UMIN3:
5781 case AMDGPUISD::UMAX3:
5782 case AMDGPUISD::UMED3: {
5783 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5784 if (Tmp2 == 1)
5785 return 1; // Early out.
5786
5787 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5788 if (Tmp1 == 1)
5789 return 1; // Early out.
5790
5791 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5792 if (Tmp0 == 1)
5793 return 1; // Early out.
5794
5795 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5796 }
5797 default:
5798 return 1;
5799 }
5800}
5801
5802 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5803 GISelKnownBits &Analysis, Register R,
5804 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5805 unsigned Depth) const {
5806 const MachineInstr *MI = MRI.getVRegDef(R);
5807 if (!MI)
5808 return 1;
5809
5810 // TODO: Check range metadata on MMO.
5811 switch (MI->getOpcode()) {
5812 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5813 return 25;
5814 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5815 return 17;
5816 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5817 return 24;
5818 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5819 return 16;
5820 case AMDGPU::G_AMDGPU_SMED3:
5821 case AMDGPU::G_AMDGPU_UMED3: {
5822 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5823 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5824 if (Tmp2 == 1)
5825 return 1;
5826 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5827 if (Tmp1 == 1)
5828 return 1;
5829 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5830 if (Tmp0 == 1)
5831 return 1;
5832 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5833 }
5834 default:
5835 return 1;
5836 }
5837}
5838
5839 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5840 const SelectionDAG &DAG,
5841 bool SNaN,
5842 unsigned Depth) const {
5843 unsigned Opcode = Op.getOpcode();
5844 switch (Opcode) {
5845 case AMDGPUISD::FMIN_LEGACY:
5846 case AMDGPUISD::FMAX_LEGACY: {
5847 if (SNaN)
5848 return true;
5849
5850 // TODO: Can check no nans on one of the operands for each one, but which
5851 // one?
5852 return false;
5853 }
5854 case AMDGPUISD::FMUL_LEGACY:
5855 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5856 if (SNaN)
5857 return true;
5858 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5859 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5860 }
5861 case AMDGPUISD::FMED3:
5862 case AMDGPUISD::FMIN3:
5863 case AMDGPUISD::FMAX3:
5864 case AMDGPUISD::FMINIMUM3:
5865 case AMDGPUISD::FMAXIMUM3:
5866 case AMDGPUISD::FMAD_FTZ: {
5867 if (SNaN)
5868 return true;
5869 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5870 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5871 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5872 }
5873 case AMDGPUISD::CVT_F32_UBYTE0:
5874 case AMDGPUISD::CVT_F32_UBYTE1:
5875 case AMDGPUISD::CVT_F32_UBYTE2:
5876 case AMDGPUISD::CVT_F32_UBYTE3:
5877 return true;
5878
5879 case AMDGPUISD::RCP:
5880 case AMDGPUISD::RSQ:
5881 case AMDGPUISD::RCP_LEGACY:
5882 case AMDGPUISD::RSQ_CLAMP: {
5883 if (SNaN)
5884 return true;
5885
5886 // TODO: Need is known positive check.
5887 return false;
5888 }
5889 case ISD::FLDEXP:
5890 case AMDGPUISD::FRACT: {
5891 if (SNaN)
5892 return true;
5893 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5894 }
5895 case AMDGPUISD::DIV_SCALE:
5896 case AMDGPUISD::DIV_FMAS:
5897 case AMDGPUISD::DIV_FIXUP:
5898 // TODO: Refine on operands.
5899 return SNaN;
5900 case AMDGPUISD::SIN_HW:
5901 case AMDGPUISD::COS_HW: {
5902 // TODO: Need check for infinity
5903 return SNaN;
5904 }
5905 case ISD::INTRINSIC_WO_CHAIN: {
5906 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5907 // TODO: Handle more intrinsics
5908 switch (IntrinsicID) {
5909 case Intrinsic::amdgcn_cubeid:
5910 return true;
5911
5912 case Intrinsic::amdgcn_frexp_mant: {
5913 if (SNaN)
5914 return true;
5915 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5916 }
5917 case Intrinsic::amdgcn_cvt_pkrtz: {
5918 if (SNaN)
5919 return true;
5920 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5921 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5922 }
5923 case Intrinsic::amdgcn_rcp:
5924 case Intrinsic::amdgcn_rsq:
5925 case Intrinsic::amdgcn_rcp_legacy:
5926 case Intrinsic::amdgcn_rsq_legacy:
5927 case Intrinsic::amdgcn_rsq_clamp: {
5928 if (SNaN)
5929 return true;
5930
5931 // TODO: Need is known positive check.
5932 return false;
5933 }
5934 case Intrinsic::amdgcn_trig_preop:
5935 case Intrinsic::amdgcn_fdot2:
5936 // TODO: Refine on operand
5937 return SNaN;
5938 case Intrinsic::amdgcn_fma_legacy:
5939 if (SNaN)
5940 return true;
5941 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5942 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5943 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5944 default:
5945 return false;
5946 }
5947 }
5948 default:
5949 return false;
5950 }
5951}
5952
5953 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
5954 Register N0, Register N1) const {
5955 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
5956}
5957
5958 TargetLowering::AtomicExpansionKind
5959 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
5960 switch (RMW->getOperation()) {
5961 case AtomicRMWInst::Nand:
5962 case AtomicRMWInst::FAdd:
5963 case AtomicRMWInst::FSub:
5964 case AtomicRMWInst::FMax:
5965 case AtomicRMWInst::FMin:
5966 return AtomicExpansionKind::CmpXChg;
5967 default: {
5968 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
5969 unsigned Size = IntTy->getBitWidth();
5970 if (Size == 32 || Size == 64)
5971 return AtomicExpansionKind::None;
5972 }
5973
5974 return AtomicExpansionKind::CmpXChg;
5975 }
5976 }
5977}
5978
5979/// Whether it is profitable to sink the operands of an
5980/// Instruction I to the basic block of I.
5981/// This helps using several modifiers (like abs and neg) more often.
5982 bool AMDGPUTargetLowering::shouldSinkOperands(
5983 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5984 using namespace PatternMatch;
5985
5986 for (auto &Op : I->operands()) {
5987 // Ensure we are not already sinking this operand.
5988 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
5989 continue;
5990
5991 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
5992 Ops.push_back(&Op);
5993 }
5994
5995 return !Ops.empty();
5996}
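// Sinking sketch: if a dominating block computes %n = fneg float %v and its
// only use is an fma in this block, moving the fneg next to its user lets
// instruction selection fold it into a VOP source modifier (neg) instead of
// materializing it as a separate instruction.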
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_READONLY
Definition: Compiler.h:227
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:266
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1260
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1042
const fltSemantics & getSemantics() const
Definition: APFloat.h:1303
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1060
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1026
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ FSub
*p = old - v
Definition: Instructions.h:788
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
BinOp getOperation() const
Definition: Instructions.h:845
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for unsupported feature in backend.
iterator_range< arg_iterator > args()
Definition: Function.h:838
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:263
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
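A hedged sketch of the SplitScalar helper above: splitting a 64-bit scalar into its 32-bit halves, as lowering code often does before reassembling a result with BUILD_PAIR. The function name is illustrative.
// Sketch only: Val is assumed to be an MVT::i64 value.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG, const SDLoc &DL,
                                            SDValue Val) {
  // Emits EXTRACT_ELEMENT nodes for the low (index 0) and high (index 1)
  // 32-bit halves of the 64-bit input.
  return DAG.SplitScalar(Val, DL, MVT::i32, MVT::i32);
}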
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering of inequal strings isn't needed.
Definition: StringRef.h:164
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predicted right.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
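A hedged fragment showing how addBypassSlowDiv is typically called from a target's TargetLowering constructor; the bit widths here are illustrative rather than taken from this file.
// Sketch only: inside a hypothetical TargetLowering constructor.
// Ask the IR-level slow-division bypass to try a 32-bit divide first when a
// 64-bit udiv/sdiv's operands happen to fit in 32 bits at run time.
addBypassSlowDiv(/*SlowBitWidth=*/64, /*FastBitWidth=*/32);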
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes=true) const
Returns the type for the shift amount of a shift opcode.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
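A hedged fragment of the usual Promote + AddPromotedToType pairing in a TargetLowering constructor; the opcode and types below are placeholders chosen only to illustrate the API.
// Sketch only: hypothetical constructor code, not this file's configuration.
// First mark the operation as promoted, then name the type the legalizer
// should rewrite it to.
setOperationAction(ISD::BSWAP, MVT::i16, Promote);
AddPromotedToType(ISD::BSWAP, MVT::i16, MVT::i32);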
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom DAG combiner for by implementing the PerformDAGCombine virtual method.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate what to do about it.
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be used to store the results of comparisons for use by selects and conditional branches.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequences that increase the amount of flow control.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the DemandedBits/DemandedElts - bitwise ops etc.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
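A hedged sketch of how a custom store-lowering hook might combine allowsMisalignedMemoryAccesses with expandUnalignedStore; the function name is hypothetical and this is not the lowering implemented in this file.
// Sketch only.
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

static SDValue lowerStoreSketch(const TargetLowering &TLI, SDValue Op,
                                SelectionDAG &DAG) {
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  unsigned Fast = 0;
  if (TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
                                         ST->getAddressSpace(), ST->getAlign(),
                                         ST->getMemOperand()->getFlags(),
                                         &Fast) &&
      Fast)
    return SDValue(); // Hardware copes; keep the original store.
  // Otherwise split into two half-width stores (or per-element stores).
  return TLI.expandUnalignedStore(ST, DAG);
}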
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tessellation and geometry are not in use, or otherwise copy shader if one is needed).
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is not commonly executed.
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:723
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition: ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:269
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:985
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1037
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:255
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-precision (16 bit) floating point numbers.
Definition: ISDOpcodes.h:913
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:722
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1058
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1062
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:507
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant, which is required to be operand #1) half of the integer or float value specified as operand #0.
Definition: ISDOpcodes.h:222
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:984
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1047
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register.
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:736
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1103
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
Definition: ISDOpcodes.h:990
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1214
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:279
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:944
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition: ISDOpcodes.h:869
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1100
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero or sign extended from a narrower type.
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1529
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1509
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
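These constants usually appear when exp/log are rewritten in terms of exp2/log2 during lowering. The stand-alone check below assumes only llvm/Support/MathExtras.h and the C math library.
// exp(x) == exp2(x * log2e) and log(x) == log2(x) * ln2, up to rounding.
#include "llvm/Support/MathExtras.h"
#include <cmath>
#include <cstdio>

int main() {
  double x = 1.7;
  std::printf("%f %f\n", std::exp(x), std::exp2(x * llvm::numbers::log2e));
  std::printf("%f %f\n", std::log(x), std::log2(x) * llvm::numbers::ln2);
  return 0;
}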
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool getAlign(const Function &F, unsigned index, unsigned &align)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
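A stand-alone sketch exercising the small integer helpers listed above (Hi_32, Lo_32, PowerOf2Ceil, countl_zero); the values are arbitrary.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  uint64_t V = 0x0000000100000002ULL;
  assert(llvm::Hi_32(V) == 1u && llvm::Lo_32(V) == 2u);
  assert(llvm::PowerOf2Ceil(17) == 32u);          // round up to a power of two
  assert(llvm::countl_zero<uint32_t>(1u) == 31);  // 31 leading zeros in 0x1
  return 0;
}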
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1387
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
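A stand-alone sketch for the Align helpers listed above (alignTo, commonAlignment, Log2); the numbers are arbitrary.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(16);
  assert(llvm::alignTo(10, A) == 16);                // next multiple of 16
  assert(llvm::commonAlignment(A, 4).value() == 4);  // 16-aligned base + 4
  assert(llvm::Log2(A) == 4);                        // 16 == 1 << 4
  return 0;
}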
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:462
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:141
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
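A stand-alone sketch of the EVT queries listed above on two made-up types, one that needs rounding and one non-power-of-2 vector.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT I96 = llvm::EVT::getIntegerVT(Ctx, 96);
  llvm::EVT V3F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 3);

  assert(!I96.isSimple());                            // no MVT::i96 exists
  assert(I96.getRoundIntegerType(Ctx) == llvm::EVT(llvm::MVT::i128));
  assert(V3F32.isFloatingPoint() && V3F32.isVector());
  assert(!V3F32.isPow2VectorType());
  assert(V3F32.getPow2VectorType(Ctx).getVectorNumElements() == 4);
  assert(V3F32.changeTypeToInteger() == llvm::EVT(llvm::MVT::v3i32));
  return 0;
}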
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:110
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bits.
Definition: KnownBits.h:265
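A stand-alone sketch for the KnownBits queries listed above, built directly from the public Zero/One masks; the numbers are arbitrary.
#include "llvm/Support/KnownBits.h"
#include <cassert>

int main() {
  llvm::KnownBits Known(32);
  assert(Known.isUnknown());            // nothing proven yet
  Known.Zero.setHighBits(24);           // top 24 bits proven zero
  Known.One.setBit(3);                  // bit 3 proven one
  assert(Known.countMinLeadingZeros() == 24);
  assert(Known.countMaxActiveBits() == 8);  // value fits in the low 8 bits
  assert(Known.isNonNegative() && !Known.isNegative());
  assert(Known.getMaxValue() == 0xFF);      // every unknown low bit could be 1
  return 0;
}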
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...