1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
55 "amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing",
62 cl::desc("Use indirect register addressing for divergent indexes"),
63 cl::init(false));
64
67 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
68}
69
72 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
73}
74
75static unsigned findFirstFreeSGPR(CCState &CCInfo) {
76 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
77 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
79 return AMDGPU::SGPR0 + Reg;
80 }
81 }
82 llvm_unreachable("Cannot allocate sgpr");
83}
84
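// Note (explanatory, not from the original source): the register class
// assignments and operation actions set up in the constructor below declare
// which register class backs each legal type and whether each operation is
// Legal, must be Expanded or Promoted by the legalizer, or is Custom-lowered
// by SITargetLowering.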
86 const GCNSubtarget &STI)
87 : AMDGPUTargetLowering(TM, STI),
88 Subtarget(&STI) {
89 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
90 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
91
92 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
93 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
94
95 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
99
100 addRegisterClass(MVT::f64, V64RegClass);
101 addRegisterClass(MVT::v2f32, V64RegClass);
102 addRegisterClass(MVT::Untyped, V64RegClass);
103
104 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
105 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
106
107 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
108 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
109
110 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
111 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
112
113 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
114 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
115
116 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
117 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
118
119 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
120 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
121
122 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
123 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
124
125 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
126 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
127
128 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
129 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
130
131 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
132 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
133
134 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
135 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
136
137 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
138 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
139
140 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
141 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
142
143 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
144 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
145
146 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
147 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
148
149 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
150 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
151
152 if (Subtarget->has16BitInsts()) {
153 if (Subtarget->useRealTrue16Insts()) {
154 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
156 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
157 } else {
158 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
160 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
161 }
162
 163 // Unless there are also VOP3P operations, no operations on these types are really legal.
164 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
170 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
173 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
176 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
178 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
179 }
180
181 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
182 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
183
185
186 // The boolean content concept here is too inflexible. Compares only ever
187 // really produce a 1-bit result. Any copy/extend from these will turn into a
188 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
189 // it's what most targets use.
192
193 // We need to custom lower vector stores from local memory
195 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
196 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
197 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
198 MVT::i1, MVT::v32i32},
199 Custom);
200
202 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
203 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
204 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
205 MVT::i1, MVT::v32i32},
206 Custom);
207
208 if (isTypeLegal(MVT::bf16)) {
209 for (unsigned Opc :
218 ISD::SETCC}) {
219 // FIXME: The promoted to type shouldn't need to be explicit
220 setOperationAction(Opc, MVT::bf16, Promote);
221 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
222 }
223
225
227 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
228
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
237 }
238
239 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
240 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
241 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
242 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
243 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
244 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
245 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
246 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
247 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
248 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
249 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
250 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
251 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
252 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
253 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
254 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
255
256 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
257 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
258 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
261 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
262 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
263
264 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
265
269 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
270
271 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
272
274 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
275
277 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
278 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
279
281 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
282 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
283 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
284 Expand);
286 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
287 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
288 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
289 Expand);
290
292 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
293 MVT::v3i16, MVT::v4i16, MVT::Other},
294 Custom);
295
298 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
299
301
303
305 Expand);
306
307#if 0
309#endif
310
311 // We only support LOAD/STORE and vector manipulation ops for vectors
312 // with > 4 elements.
313 for (MVT VT :
314 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
315 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
316 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
317 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
318 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
319 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
320 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
321 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
322 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
323 switch (Op) {
324 case ISD::LOAD:
325 case ISD::STORE:
327 case ISD::BITCAST:
328 case ISD::UNDEF:
332 case ISD::IS_FPCLASS:
333 break;
338 break;
339 default:
341 break;
342 }
343 }
344 }
345
347
348 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
349 // is expanded to avoid having two separate loops in case the index is a VGPR.
350
351 // Most operations are naturally 32-bit vector operations. We only support
352 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
353 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
355 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
356
358 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
359
361 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
362
364 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
365 }
366
367 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
369 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
370
372 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
373
375 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
376
378 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
379 }
380
381 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
383 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
384
386 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
387
389 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
390
392 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
393 }
394
395 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
397 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
398
400 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
401
403 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
404
406 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
407 }
408
409 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
411 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
412
414 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
415
417 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
418
420 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
421 }
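  // Explanatory note: promoting to a same-sized 32-bit vector type means the
  // legalizer performs the operation on a bitcast of the value (e.g. v2i64
  // handled as v4i32) and bitcasts the result back, so only the 32-bit forms
  // need real support.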
422
424 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
425 Expand);
426
427 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
428 Custom);
429
430 // Avoid stack access for these.
431 // TODO: Generalize to more vector types.
433 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
434 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
435 Custom);
436
437 // Deal with vec3 vector operations when widened to vec4.
439 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
440
441 // Deal with vec5/6/7 vector operations when widened to vec8.
443 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
444 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
445 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
446 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
447 Custom);
448
449 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
450 // and output demarshalling
451 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
452
453 // We can't return success/failure, only the old value,
454 // let LLVM add the comparison
456 Expand);
457
458 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
459
460 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
461
462 // FIXME: This should be narrowed to i32, but that only happens if i64 is
463 // illegal.
464 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
465 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
466
 467 // On SI this is s_memtime; on VI it is s_memrealtime.
469
470 if (Subtarget->hasSMemRealTime() ||
474
475 if (Subtarget->has16BitInsts()) {
478 } else {
480 }
481
482 if (Subtarget->hasMadMacF32Insts())
484
485 if (!Subtarget->hasBFI())
486 // fcopysign can be done in a single instruction with BFI.
487 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
488
489 if (!Subtarget->hasBCNT(32))
491
492 if (!Subtarget->hasBCNT(64))
494
495 if (Subtarget->hasFFBH())
497
498 if (Subtarget->hasFFBL())
500
501 // We only really have 32-bit BFE instructions (and 16-bit on VI).
502 //
503 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
504 // effort to match them now. We want this to be false for i64 cases when the
505 // extraction isn't restricted to the upper or lower half. Ideally we would
506 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
507 // span the midpoint are probably relatively rare, so don't worry about them
508 // for now.
509 if (Subtarget->hasBFE())
511
512 // Clamp modifier on add/sub
513 if (Subtarget->hasIntClamp())
515
516 if (Subtarget->hasAddNoCarry())
517 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
518 Legal);
519
520 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
521 Custom);
522
523 // These are really only legal for ieee_mode functions. We should be avoiding
524 // them for functions that don't have ieee_mode enabled, so just say they are
525 // legal.
527 {MVT::f32, MVT::f64}, Legal);
528
529 if (Subtarget->haveRoundOpsF64())
531 Legal);
532 else
534 MVT::f64, Custom);
535
537 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
538 Legal);
539 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
540
543
544 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
546
547 // Custom lower these because we can't specify a rule based on an illegal
548 // source bf16.
551
552 if (Subtarget->has16BitInsts()) {
555 MVT::i16, Legal);
556
557 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
558
560 MVT::i16, Expand);
561
565 ISD::CTPOP},
566 MVT::i16, Promote);
567
569
570 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
571
573 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
575 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
576
580
582
583 // F16 - Constant Actions.
586
587 // F16 - Load/Store Actions.
589 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
591 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
592
593 // BF16 - Load/Store Actions.
595 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
597 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
598
599 // F16 - VOP1 Actions.
602 MVT::f16, Custom);
603
606
607 // F16 - VOP2 Actions.
608 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
609 Expand);
613
614 // F16 - VOP3 Actions.
616 if (STI.hasMadF16())
618
619 for (MVT VT :
620 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
621 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
622 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
623 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
624 switch (Op) {
625 case ISD::LOAD:
626 case ISD::STORE:
628 case ISD::BITCAST:
629 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
639 break;
640 default:
642 break;
643 }
644 }
645 }
646
647 // v_perm_b32 can handle either of these.
648 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
650
651 // XXX - Do these do anything? Vector constants turn into build_vector.
652 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
653
654 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
655 Legal);
656
658 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
660 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
661
663 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
665 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
666
667 setOperationAction(ISD::AND, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::OR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
671 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
672 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
673
675 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
677 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
678 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
679 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
680
682 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
684 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
686 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
687
689 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
691 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
692 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
694
696 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
698 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
699
701 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
703 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
705 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
706
707 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
711 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
713
715 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
717 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
718 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
720
721 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
725 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
726 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
727
729 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
731 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
732 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
734
736 MVT::v2i32, Expand);
738
740 MVT::v4i32, Expand);
741
743 MVT::v8i32, Expand);
744
745 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
746 Subtarget->hasVOP3PInsts() ? Legal : Custom);
747
748 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
749 // This isn't really legal, but this avoids the legalizer unrolling it (and
750 // allows matching fneg (fabs x) patterns)
751 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
752
755
757 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
758 Custom);
759
761 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
762 Expand);
763
764 for (MVT Vec16 :
765 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
766 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
769 Vec16, Custom);
771 }
772 }
773
774 if (Subtarget->hasVOP3PInsts()) {
778 MVT::v2i16, Legal);
779
782 MVT::v2f16, Legal);
783
784 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
785 Custom);
786
788 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
789 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
790 Custom);
791
792 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
793 // Split vector operations.
798 VT, Custom);
799
800 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
801 // Split vector operations.
803 VT, Custom);
804
805 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
806 Custom);
807
808 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
809 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
810 Custom);
811
812 if (Subtarget->hasPackedFP32Ops()) {
814 MVT::v2f32, Legal);
816 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
817 Custom);
818 }
819 }
820
822
823 if (Subtarget->has16BitInsts()) {
825 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
827 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
828 } else {
829 // Legalization hack.
830 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
831
833 }
834
836 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
837 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
838 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
839 MVT::v32f16, MVT::v32bf16},
840 Custom);
841
843
844 if (Subtarget->hasScalarSMulU64())
846
847 if (Subtarget->hasMad64_32())
849
850 if (Subtarget->hasPrefetch())
852
853 if (Subtarget->hasIEEEMinMax()) {
855 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
857 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
858 Custom);
859 }
860
862 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
863 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
864 MVT::i8},
865 Custom);
866
868 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
869 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
870 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
871 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
872 Custom);
873
875 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
876 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
877 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
878 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
879 Custom);
880
886
887 // TODO: Could move this to custom lowering, could benefit from combines on
888 // extract of relevant bits.
890
892
895 ISD::SUB,
897 ISD::FADD,
898 ISD::FSUB,
899 ISD::FDIV,
906 ISD::FMA,
907 ISD::SMIN,
908 ISD::SMAX,
909 ISD::UMIN,
910 ISD::UMAX,
912 ISD::AND,
913 ISD::OR,
914 ISD::XOR,
915 ISD::FSHR,
925
926 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
928
929 // All memory operations. Some folding on the pointer operand is done to help
930 // matching the constant offsets in the addressing modes.
955
956 // FIXME: In other contexts we pretend this is a per-function property.
958
960}
961
963 return Subtarget;
964}
965
967 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
968 return RCRegs;
969}
970
971//===----------------------------------------------------------------------===//
972// TargetLowering queries
973//===----------------------------------------------------------------------===//
974
975// v_mad_mix* support a conversion from f16 to f32.
976//
 977 // There is only one special case where this is OK to use with denormals
 978 // enabled, and we don't currently handle it.
979bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
980 EVT DestVT, EVT SrcVT) const {
981 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
982 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
983 DestVT.getScalarType() == MVT::f32 &&
984 SrcVT.getScalarType() == MVT::f16 &&
985 // TODO: This probably only requires no input flushing?
987}
988
990 LLT DestTy, LLT SrcTy) const {
991 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
992 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
993 DestTy.getScalarSizeInBits() == 32 &&
994 SrcTy.getScalarSizeInBits() == 16 &&
995 // TODO: This probably only requires no input flushing?
997}
998
1000 // SI has some legal vector types, but no legal vector operations. Say no
1001 // shuffles are legal in order to prefer scalarizing some vector operations.
1002 return false;
1003}
1004
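// Explanatory note: for non-kernel calling conventions, 16-bit vector elements
// are packed two to a 32-bit register, e.g. a v4f16 value is passed in two
// v2f16 registers, while bf16 vectors fall back to plain i32 registers.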
1007 EVT VT) const {
1010
1011 if (VT.isVector()) {
1012 EVT ScalarVT = VT.getScalarType();
1013 unsigned Size = ScalarVT.getSizeInBits();
1014 if (Size == 16) {
1015 if (Subtarget->has16BitInsts()) {
1016 if (VT.isInteger())
1017 return MVT::v2i16;
1018 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1019 }
1020 return VT.isInteger() ? MVT::i32 : MVT::f32;
1021 }
1022
1023 if (Size < 16)
1024 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1025 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1026 }
1027
1028 if (VT.getSizeInBits() > 32)
1029 return MVT::i32;
1030
1032}
1033
1036 EVT VT) const {
1039
1040 if (VT.isVector()) {
1041 unsigned NumElts = VT.getVectorNumElements();
1042 EVT ScalarVT = VT.getScalarType();
1043 unsigned Size = ScalarVT.getSizeInBits();
1044
1045 // FIXME: Should probably promote 8-bit vectors to i16.
1046 if (Size == 16 && Subtarget->has16BitInsts())
1047 return (NumElts + 1) / 2;
1048
1049 if (Size <= 32)
1050 return NumElts;
1051
1052 if (Size > 32)
1053 return NumElts * ((Size + 31) / 32);
1054 } else if (VT.getSizeInBits() > 32)
1055 return (VT.getSizeInBits() + 31) / 32;
1056
1058}
1059
1061 LLVMContext &Context, CallingConv::ID CC,
1062 EVT VT, EVT &IntermediateVT,
1063 unsigned &NumIntermediates, MVT &RegisterVT) const {
1064 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1065 unsigned NumElts = VT.getVectorNumElements();
1066 EVT ScalarVT = VT.getScalarType();
1067 unsigned Size = ScalarVT.getSizeInBits();
1068 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1069 // support, but unless we can properly handle 3-vectors, it will still be
1070 // inconsistent.
1071 if (Size == 16 && Subtarget->has16BitInsts()) {
1072 if (ScalarVT == MVT::bf16) {
1073 RegisterVT = MVT::i32;
1074 IntermediateVT = MVT::v2bf16;
1075 } else {
1076 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1077 IntermediateVT = RegisterVT;
1078 }
1079 NumIntermediates = (NumElts + 1) / 2;
1080 return NumIntermediates;
1081 }
1082
1083 if (Size == 32) {
1084 RegisterVT = ScalarVT.getSimpleVT();
1085 IntermediateVT = RegisterVT;
1086 NumIntermediates = NumElts;
1087 return NumIntermediates;
1088 }
1089
1090 if (Size < 16 && Subtarget->has16BitInsts()) {
1091 // FIXME: Should probably form v2i16 pieces
1092 RegisterVT = MVT::i16;
1093 IntermediateVT = ScalarVT;
1094 NumIntermediates = NumElts;
1095 return NumIntermediates;
1096 }
1097
1098
1099 if (Size != 16 && Size <= 32) {
1100 RegisterVT = MVT::i32;
1101 IntermediateVT = ScalarVT;
1102 NumIntermediates = NumElts;
1103 return NumIntermediates;
1104 }
1105
1106 if (Size > 32) {
1107 RegisterVT = MVT::i32;
1108 IntermediateVT = RegisterVT;
1109 NumIntermediates = NumElts * ((Size + 31) / 32);
1110 return NumIntermediates;
1111 }
1112 }
1113
1115 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1116}
1117
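// Explanatory note: this helper computes the in-memory type of a buffer/image
// load intrinsic result, clamping the vector width to the number of lanes
// actually loaded (MaxNumLanes), e.g. as implied by an image instruction's
// dmask.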
1119 const DataLayout &DL, Type *Ty,
1120 unsigned MaxNumLanes) {
1121 assert(MaxNumLanes != 0);
1122
1123 LLVMContext &Ctx = Ty->getContext();
1124 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1125 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1126 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1127 NumElts);
1128 }
1129
1130 return TLI.getValueType(DL, Ty);
1131}
1132
1133// Peek through TFE struct returns to only use the data size.
1135 const DataLayout &DL, Type *Ty,
1136 unsigned MaxNumLanes) {
1137 auto *ST = dyn_cast<StructType>(Ty);
1138 if (!ST)
1139 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1140
1141 // TFE intrinsics return an aggregate type.
1142 assert(ST->getNumContainedTypes() == 2 &&
1143 ST->getContainedType(1)->isIntegerTy(32));
1144 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1145}
1146
1147/// Map address space 7 to MVT::v5i32 because that's its in-memory
1148/// representation. This return value is vector-typed because there is no
1149/// MVT::i160 and it is not clear if one can be added. While this could
1150/// cause issues during codegen, these address space 7 pointers will be
1151/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1152/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1153/// modeling, to work.
1155 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1156 return MVT::v5i32;
1158 DL.getPointerSizeInBits(AS) == 192)
1159 return MVT::v6i32;
1161}
1162/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1163/// v8i32 when padding is added.
1164/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1165/// also v8i32 with padding.
1167 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1168 DL.getPointerSizeInBits(AS) == 160) ||
1170 DL.getPointerSizeInBits(AS) == 192))
1171 return MVT::v8i32;
1173}
1174
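// Explanatory note: getTgtMemIntrinsic describes the memory access performed
// by a target intrinsic (value type, pointer operand, volatility and
// read/write behavior) so SelectionDAG can attach an accurate
// MachineMemOperand to the resulting node.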
1176 const CallInst &CI,
1177 MachineFunction &MF,
1178 unsigned IntrID) const {
1180 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1182
1183 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1186 (Intrinsic::ID)IntrID);
1187 MemoryEffects ME = Attr.getMemoryEffects();
1188 if (ME.doesNotAccessMemory())
1189 return false;
1190
1191 // TODO: Should images get their own address space?
1192 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1193
1194 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1195 if (RsrcIntr->IsImage) {
1198 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1199 Info.align.reset();
1200 }
1201
1202 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1203 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1204 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1205 // We conservatively set the memory operand of a buffer intrinsic to the
1206 // base resource pointer, so that we can access alias information about
1207 // those pointers. Cases like "this points at the same value
1208 // but with a different offset" are handled in
1209 // areMemAccessesTriviallyDisjoint.
1210 Info.ptrVal = RsrcArg;
1211 }
1212
1213 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1214 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1217 if (ME.onlyReadsMemory()) {
1218 if (RsrcIntr->IsImage) {
1219 unsigned MaxNumLanes = 4;
1220
1221 if (!BaseOpcode->Gather4) {
1222 // If this isn't a gather, we may have excess loaded elements in the
1223 // IR type. Check the dmask for the real number of elements loaded.
1224 unsigned DMask
1225 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1226 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1227 }
1228
1229 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1230 CI.getType(), MaxNumLanes);
1231 } else {
1232 Info.memVT =
1234 std::numeric_limits<unsigned>::max());
1235 }
1236
1237 // FIXME: What does alignment mean for an image?
1240 } else if (ME.onlyWritesMemory()) {
1242
1243 Type *DataTy = CI.getArgOperand(0)->getType();
1244 if (RsrcIntr->IsImage) {
1245 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1246 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1247 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1248 DMaskLanes);
1249 } else
1250 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1251
1253 } else {
1254 // Atomic or NoReturn Sampler
1255 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1260
1261 switch (IntrID) {
1262 default:
1263 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1264 // Fake memory access type for no return sampler intrinsics
1265 Info.memVT = MVT::i32;
1266 } else {
1267 // XXX - Should this be volatile without known ordering?
1269 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1270 }
1271 break;
1272 case Intrinsic::amdgcn_raw_buffer_load_lds:
1273 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1274 case Intrinsic::amdgcn_struct_buffer_load_lds:
1275 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1276 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1277 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1278 Info.ptrVal = CI.getArgOperand(1);
1279 return true;
1280 }
1281 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1282 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1283 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1284 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1285 Info.memVT =
1287 std::numeric_limits<unsigned>::max());
1288 Info.flags &= ~MachineMemOperand::MOStore;
1289 return true;
1290 }
1291 }
1292 }
1293 return true;
1294 }
1295
1296 switch (IntrID) {
1297 case Intrinsic::amdgcn_ds_ordered_add:
1298 case Intrinsic::amdgcn_ds_ordered_swap: {
1300 Info.memVT = MVT::getVT(CI.getType());
1301 Info.ptrVal = CI.getOperand(0);
1302 Info.align.reset();
1304
1305 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1306 if (!Vol->isZero())
1308
1309 return true;
1310 }
1311 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1312 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1314 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1315 Info.ptrVal = nullptr;
1316 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1318 return true;
1319 }
1320 case Intrinsic::amdgcn_ds_append:
1321 case Intrinsic::amdgcn_ds_consume: {
1323 Info.memVT = MVT::getVT(CI.getType());
1324 Info.ptrVal = CI.getOperand(0);
1325 Info.align.reset();
1327
1328 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1329 if (!Vol->isZero())
1331
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_csub: {
1336 Info.memVT = MVT::getVT(CI.getType());
1337 Info.ptrVal = CI.getOperand(0);
1338 Info.align.reset();
1342 return true;
1343 }
1344 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1346 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1347
1348 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1349 Info.align.reset();
1352 return true;
1353 }
1354 case Intrinsic::amdgcn_global_atomic_fadd:
1355 case Intrinsic::amdgcn_global_atomic_fmin:
1356 case Intrinsic::amdgcn_global_atomic_fmax:
1357 case Intrinsic::amdgcn_global_atomic_fmin_num:
1358 case Intrinsic::amdgcn_global_atomic_fmax_num:
1359 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1360 case Intrinsic::amdgcn_flat_atomic_fadd:
1361 case Intrinsic::amdgcn_flat_atomic_fmin:
1362 case Intrinsic::amdgcn_flat_atomic_fmax:
1363 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1364 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1365 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1366 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1367 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1369 Info.memVT = MVT::getVT(CI.getType());
1370 Info.ptrVal = CI.getOperand(0);
1371 Info.align.reset();
1376 return true;
1377 }
1378 case Intrinsic::amdgcn_global_load_tr_b64:
1379 case Intrinsic::amdgcn_global_load_tr_b128: {
1381 Info.memVT = MVT::getVT(CI.getType());
1382 Info.ptrVal = CI.getOperand(0);
1383 Info.align.reset();
1385 return true;
1386 }
1387 case Intrinsic::amdgcn_ds_gws_init:
1388 case Intrinsic::amdgcn_ds_gws_barrier:
1389 case Intrinsic::amdgcn_ds_gws_sema_v:
1390 case Intrinsic::amdgcn_ds_gws_sema_br:
1391 case Intrinsic::amdgcn_ds_gws_sema_p:
1392 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1394
1395 const GCNTargetMachine &TM =
1396 static_cast<const GCNTargetMachine &>(getTargetMachine());
1397
1399 Info.ptrVal = MFI->getGWSPSV(TM);
1400
1401 // This is an abstract access, but we need to specify a type and size.
1402 Info.memVT = MVT::i32;
1403 Info.size = 4;
1404 Info.align = Align(4);
1405
1406 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1408 else
1410 return true;
1411 }
1412 case Intrinsic::amdgcn_global_load_lds: {
1414 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1415 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1416 Info.ptrVal = CI.getArgOperand(1);
1418 return true;
1419 }
1420 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1422
1423 const GCNTargetMachine &TM =
1424 static_cast<const GCNTargetMachine &>(getTargetMachine());
1425
1427 Info.ptrVal = MFI->getGWSPSV(TM);
1428
1429 // This is an abstract access, but we need to specify a type and size.
1430 Info.memVT = MVT::i32;
1431 Info.size = 4;
1432 Info.align = Align(4);
1433
1435 return true;
1436 }
1437 default:
1438 return false;
1439 }
1440}
1441
1443 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1444 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1445 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1446 // The DAG's ValueType loses the addrspaces.
1447 // Add them as 2 extra Constant operands "from" and "to".
1448 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1449 unsigned DstAS = I.getType()->getPointerAddressSpace();
1450 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1451 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1452 break;
1453 }
1454 default:
1455 break;
1456 }
1457}
1458
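// Explanatory note: this hook reports the pointer operand of memory intrinsics
// so CodeGenPrepare can sink address computations next to the intrinsic that
// uses them.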
1461 Type *&AccessTy) const {
1462 Value *Ptr = nullptr;
1463 switch (II->getIntrinsicID()) {
1464 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1465 case Intrinsic::amdgcn_ds_append:
1466 case Intrinsic::amdgcn_ds_consume:
1467 case Intrinsic::amdgcn_ds_ordered_add:
1468 case Intrinsic::amdgcn_ds_ordered_swap:
1469 case Intrinsic::amdgcn_flat_atomic_fadd:
1470 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1471 case Intrinsic::amdgcn_flat_atomic_fmax:
1472 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1473 case Intrinsic::amdgcn_flat_atomic_fmin:
1474 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1475 case Intrinsic::amdgcn_global_atomic_csub:
1476 case Intrinsic::amdgcn_global_atomic_fadd:
1477 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1478 case Intrinsic::amdgcn_global_atomic_fmax:
1479 case Intrinsic::amdgcn_global_atomic_fmax_num:
1480 case Intrinsic::amdgcn_global_atomic_fmin:
1481 case Intrinsic::amdgcn_global_atomic_fmin_num:
1482 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1483 case Intrinsic::amdgcn_global_load_tr_b64:
1484 case Intrinsic::amdgcn_global_load_tr_b128:
1485 Ptr = II->getArgOperand(0);
1486 break;
1487 case Intrinsic::amdgcn_global_load_lds:
1488 Ptr = II->getArgOperand(1);
1489 break;
1490 default:
1491 return false;
1492 }
1493 AccessTy = II->getType();
1494 Ops.push_back(Ptr);
1495 return true;
1496}
1497
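// Explanatory note: flat-style addressing allows at most a base register plus
// an immediate offset whose legal range depends on the subtarget and address
// space; there is no scaled register index.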
1499 unsigned AddrSpace) const {
1500 if (!Subtarget->hasFlatInstOffsets()) {
1501 // Flat instructions do not have offsets, and only have the register
1502 // address.
1503 return AM.BaseOffs == 0 && AM.Scale == 0;
1504 }
1505
1506 decltype(SIInstrFlags::FLAT) FlatVariant =
1510
1511 return AM.Scale == 0 &&
1512 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1513 AM.BaseOffs, AddrSpace, FlatVariant));
1514}
1515
1517 if (Subtarget->hasFlatGlobalInsts())
1519
1520 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
 1521 // Assume we will use FLAT for all global memory accesses
1522 // on VI.
1523 // FIXME: This assumption is currently wrong. On VI we still use
1524 // MUBUF instructions for the r + i addressing mode. As currently
1525 // implemented, the MUBUF instructions only work on buffer < 4GB.
1526 // It may be possible to support > 4GB buffers with MUBUF instructions,
1527 // by setting the stride value in the resource descriptor which would
1528 // increase the size limit to (stride * 4GB). However, this is risky,
1529 // because it has never been validated.
1531 }
1532
1533 return isLegalMUBUFAddressingMode(AM);
1534}
1535
1536bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1537 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1538 // additionally can do r + r + i with addr64. 32-bit has more addressing
1539 // mode options. Depending on the resource constant, it can also do
1540 // (i64 r0) + (i32 r1) * (i14 i).
1541 //
1542 // Private arrays end up using a scratch buffer most of the time, so also
1543 // assume those use MUBUF instructions. Scratch loads / stores are currently
1544 // implemented as mubuf instructions with offen bit set, so slightly
1545 // different than the normal addr64.
1546 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1547 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1548 return false;
1549
1550 // FIXME: Since we can split immediate into soffset and immediate offset,
1551 // would it make sense to allow any immediate?
1552
1553 switch (AM.Scale) {
1554 case 0: // r + i or just i, depending on HasBaseReg.
1555 return true;
1556 case 1:
1557 return true; // We have r + r or r + i.
1558 case 2:
1559 if (AM.HasBaseReg) {
1560 // Reject 2 * r + r.
1561 return false;
1562 }
1563
1564 // Allow 2 * r as r + r
1565 // Or 2 * r + i is allowed as r + r + i.
1566 return true;
1567 default: // Don't allow n * r
1568 return false;
1569 }
1570}
1571
1573 const AddrMode &AM, Type *Ty,
1574 unsigned AS, Instruction *I) const {
1575 // No global is ever allowed as a base.
1576 if (AM.BaseGV)
1577 return false;
1578
1579 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1580 return isLegalGlobalAddressingMode(AM);
1581
1582 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1586 // If the offset isn't a multiple of 4, it probably isn't going to be
1587 // correctly aligned.
1588 // FIXME: Can we get the real alignment here?
1589 if (AM.BaseOffs % 4 != 0)
1590 return isLegalMUBUFAddressingMode(AM);
1591
1592 if (!Subtarget->hasScalarSubwordLoads()) {
1593 // There are no SMRD extloads, so if we have to do a small type access we
1594 // will use a MUBUF load.
1595 // FIXME?: We also need to do this if unaligned, but we don't know the
1596 // alignment here.
1597 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1598 return isLegalGlobalAddressingMode(AM);
1599 }
1600
1602 // SMRD instructions have an 8-bit, dword offset on SI.
1603 if (!isUInt<8>(AM.BaseOffs / 4))
1604 return false;
1605 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1606 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1607 // in 8-bits, it can use a smaller encoding.
1608 if (!isUInt<32>(AM.BaseOffs / 4))
1609 return false;
1610 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1611 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1612 if (!isUInt<20>(AM.BaseOffs))
1613 return false;
1614 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1615 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1616 // for S_BUFFER_* instructions).
1617 if (!isInt<21>(AM.BaseOffs))
1618 return false;
1619 } else {
1620 // On GFX12, all offsets are signed 24-bit in bytes.
1621 if (!isInt<24>(AM.BaseOffs))
1622 return false;
1623 }
1624
1625 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1627 AM.BaseOffs < 0) {
1628 // Scalar (non-buffer) loads can only use a negative offset if
1629 // soffset+offset is non-negative. Since the compiler can only prove that
1630 // in a few special cases, it is safer to claim that negative offsets are
1631 // not supported.
1632 return false;
1633 }
1634
1635 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1636 return true;
1637
1638 if (AM.Scale == 1 && AM.HasBaseReg)
1639 return true;
1640
1641 return false;
1642 }
1643
1644 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1645 return Subtarget->enableFlatScratch()
1647 : isLegalMUBUFAddressingMode(AM);
1648
1649 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1650 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1651 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1652 // field.
1653 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1654 // an 8-bit dword offset but we don't know the alignment here.
1655 if (!isUInt<16>(AM.BaseOffs))
1656 return false;
1657
1658 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1659 return true;
1660
1661 if (AM.Scale == 1 && AM.HasBaseReg)
1662 return true;
1663
1664 return false;
1665 }
1666
1668 // For an unknown address space, this usually means that this is for some
1669 // reason being used for pure arithmetic, and not based on some addressing
1670 // computation. We don't have instructions that compute pointers with any
1671 // addressing modes, so treat them as having no offset like flat
1672 // instructions.
1674 }
1675
1676 // Assume a user alias of global for unknown address spaces.
1677 return isLegalGlobalAddressingMode(AM);
1678}
1679
1681 const MachineFunction &MF) const {
1683 return (MemVT.getSizeInBits() <= 4 * 32);
1684 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1685 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1686 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1687 }
1689 return (MemVT.getSizeInBits() <= 2 * 32);
1690 return true;
1691}
1692
1694 unsigned Size, unsigned AddrSpace, Align Alignment,
1695 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1696 if (IsFast)
1697 *IsFast = 0;
1698
1699 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1700 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1701 // Check if alignment requirements for ds_read/write instructions are
1702 // disabled.
1703 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1704 return false;
1705
1706 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1707 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1708 Alignment < RequiredAlignment)
1709 return false;
1710
 1711 // Either the alignment requirements are "enabled", or there is an
 1712 // unaligned-LDS-access hardware bug even though alignment requirements
1713 // are "disabled". In either case, we need to check for proper alignment
1714 // requirements.
1715 //
1716 switch (Size) {
1717 case 64:
1718 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1719 // address is negative, then the instruction is incorrectly treated as
1720 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1721 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1722 // load later in the SILoadStoreOptimizer.
1723 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1724 return false;
1725
 1726 // An 8 byte access via ds_read/write_b64 requires 8-byte alignment, but we
1727 // can do a 4 byte aligned, 8 byte access in a single operation using
1728 // ds_read2/write2_b32 with adjacent offsets.
1729 RequiredAlignment = Align(4);
1730
1731 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1732 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1733 // ds_write2_b32 depending on the alignment. In either case with either
1734 // alignment there is no faster way of doing this.
1735
1736 // The numbers returned here and below are not additive, it is a 'speed
1737 // rank'. They are just meant to be compared to decide if a certain way
1738 // of lowering an operation is faster than another. For that purpose
1739 // naturally aligned operation gets it bitsize to indicate that "it
1740 // operates with a speed comparable to N-bit wide load". With the full
1741 // alignment ds128 is slower than ds96 for example. If underaligned it
1742 // is comparable to a speed of a single dword access, which would then
1743 // mean 32 < 128 and it is faster to issue a wide load regardless.
 1744 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
 1745 // wider load which will no longer be aligned, the latter is slower.
1746 if (IsFast)
1747 *IsFast = (Alignment >= RequiredAlignment) ? 64
1748 : (Alignment < Align(4)) ? 32
1749 : 1;
1750 return true;
1751 }
1752
1753 break;
1754 case 96:
1755 if (!Subtarget->hasDS96AndDS128())
1756 return false;
1757
 1758 // A 12 byte access via ds_read/write_b96 requires 16-byte alignment on
1759 // gfx8 and older.
1760
1761 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1762 // Naturally aligned access is fastest. However, also report it is Fast
1763 // if memory is aligned less than DWORD. A narrow load or store will be
 1764 // as slow as a single ds_read_b96/ds_write_b96, but there will
1765 // be more of them, so overall we will pay less penalty issuing a single
1766 // instruction.
1767
1768 // See comment on the values above.
1769 if (IsFast)
1770 *IsFast = (Alignment >= RequiredAlignment) ? 96
1771 : (Alignment < Align(4)) ? 32
1772 : 1;
1773 return true;
1774 }
1775
1776 break;
1777 case 128:
1778 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1779 return false;
1780
 1781 // A 16 byte access via ds_read/write_b128 requires 16-byte alignment on
1782 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1783 // single operation using ds_read2/write2_b64.
1784 RequiredAlignment = Align(8);
1785
1786 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1787 // Naturally aligned access is fastest. However, also report it is Fast
1788 // if memory is aligned less than DWORD. A narrow load or store will be
 1789 // as slow as a single ds_read_b128/ds_write_b128, but there
1790 // will be more of them, so overall we will pay less penalty issuing a
1791 // single instruction.
1792
1793 // See comment on the values above.
1794 if (IsFast)
1795 *IsFast = (Alignment >= RequiredAlignment) ? 128
1796 : (Alignment < Align(4)) ? 32
1797 : 1;
1798 return true;
1799 }
1800
1801 break;
1802 default:
1803 if (Size > 32)
1804 return false;
1805
1806 break;
1807 }
1808
1809 // See comment on the values above.
1810 // Note that we have a single-dword or sub-dword here, so if underaligned
 1811 // it is the slowest possible access, hence the returned value is 0.
1812 if (IsFast)
1813 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1814
1815 return Alignment >= RequiredAlignment ||
1816 Subtarget->hasUnalignedDSAccessEnabled();
1817 }
1818
1819 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1820 bool AlignedBy4 = Alignment >= Align(4);
1821 if (IsFast)
1822 *IsFast = AlignedBy4;
1823
1824 return AlignedBy4 ||
1825 Subtarget->enableFlatScratch() ||
1826 Subtarget->hasUnalignedScratchAccess();
1827 }
1828
1829 // FIXME: We have to be conservative here and assume that flat operations
1830 // will access scratch. If we had access to the IR function, then we
1831 // could determine if any private memory was used in the function.
1832 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1833 !Subtarget->hasUnalignedScratchAccess()) {
1834 bool AlignedBy4 = Alignment >= Align(4);
1835 if (IsFast)
1836 *IsFast = AlignedBy4;
1837
1838 return AlignedBy4;
1839 }
1840
1841 // So long as they are correct, wide global memory operations perform better
1842 // than multiple smaller memory ops -- even when misaligned
1843 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1844 if (IsFast)
1845 *IsFast = Size;
1846
1847 return Alignment >= Align(4) ||
1849 }
1850
 1851 // A value smaller than a dword must be aligned.
1852 if (Size < 32)
1853 return false;
1854
1855 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1856 // byte-address are ignored, thus forcing Dword alignment.
1857 // This applies to private, global, and constant memory.
1858 if (IsFast)
1859 *IsFast = 1;
1860
1861 return Size >= 32 && Alignment >= Align(4);
1862}
1863
1865 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1866 unsigned *IsFast) const {
1868 Alignment, Flags, IsFast);
1869}
1870
1872 const MemOp &Op, const AttributeList &FuncAttributes) const {
1873 // FIXME: Should account for address space here.
1874
1875 // The default fallback uses the private pointer size as a guess for a type to
1876 // use. Make sure we switch these to 64-bit accesses.
1877
1878 if (Op.size() >= 16 &&
1879 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1880 return MVT::v4i32;
1881
1882 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1883 return MVT::v2i32;
1884
1885 // Use the default.
1886 return MVT::Other;
1887}
1888
1890 const MemSDNode *MemNode = cast<MemSDNode>(N);
1891 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1892}
1893
1895 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1897}
1898
1900 unsigned DestAS) const {
1901 // Flat -> private/local is a simple truncate.
 1902 // Flat -> global is a no-op.
1903 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1904 return true;
1905
1906 const GCNTargetMachine &TM =
1907 static_cast<const GCNTargetMachine &>(getTargetMachine());
1908 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1909}
1910
1912 const MemSDNode *MemNode = cast<MemSDNode>(N);
1913
1915}
1916
1919 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1920 VT.getScalarType().bitsLE(MVT::i16))
1923}
1924
1926 Type *Ty) const {
1927 // FIXME: Could be smarter if called for vector constants.
1928 return true;
1929}
1930
1932 unsigned Index) const {
1934 return false;
1935
1936 // TODO: Add more cases that are cheap.
1937 return Index == 0;
1938}
1939
1941 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1942 switch (Op) {
1943 case ISD::LOAD:
1944 case ISD::STORE:
1945
1946 // These operations are done with 32-bit instructions anyway.
1947 case ISD::AND:
1948 case ISD::OR:
1949 case ISD::XOR:
1950 case ISD::SELECT:
1951 // TODO: Extensions?
1952 return true;
1953 default:
1954 return false;
1955 }
1956 }
1957
1958 // SimplifySetCC uses this function to determine whether or not it should
1959 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1960 if (VT == MVT::i1 && Op == ISD::SETCC)
1961 return false;
1962
1964}
1965
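// Explanatory note: this builds a pointer to a kernel argument by taking the
// kernarg segment base pointer (preloaded into SGPRs) and offsetting it by the
// argument's byte offset.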
1966SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1967 const SDLoc &SL,
1968 SDValue Chain,
1969 uint64_t Offset) const {
1970 const DataLayout &DL = DAG.getDataLayout();
1973
1974 const ArgDescriptor *InputPtrReg;
1975 const TargetRegisterClass *RC;
1976 LLT ArgTy;
1978
1979 std::tie(InputPtrReg, RC, ArgTy) =
1981
1982 // We may not have the kernarg segment argument if we have no kernel
1983 // arguments.
1984 if (!InputPtrReg)
1985 return DAG.getConstant(Offset, SL, PtrVT);
1986
1988 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1989 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1990
1991 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1992}
1993
1994SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1995 const SDLoc &SL) const {
1998 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1999}
2000
2001SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2002 const SDLoc &SL) const {
2003
2005 std::optional<uint32_t> KnownSize =
2007 if (KnownSize.has_value())
2008 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2009 return SDValue();
2010}
2011
2012SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2013 const SDLoc &SL, SDValue Val,
2014 bool Signed,
2015 const ISD::InputArg *Arg) const {
2016 // First, if it is a widened vector, narrow it.
2017 if (VT.isVector() &&
2019 EVT NarrowedVT =
2022 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2023 DAG.getConstant(0, SL, MVT::i32));
2024 }
2025
2026 // Then convert the vector elements or scalar value.
2027 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2028 VT.bitsLT(MemVT)) {
2029 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2030 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2031 }
2032
2033 if (MemVT.isFloatingPoint())
2034 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2035 else if (Signed)
2036 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2037 else
2038 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2039
2040 return Val;
2041}
2042
2043SDValue SITargetLowering::lowerKernargMemParameter(
2044 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2045 uint64_t Offset, Align Alignment, bool Signed,
2046 const ISD::InputArg *Arg) const {
2048
2049 // Try to avoid using an extload by loading earlier than the argument address,
2050 // and extracting the relevant bits. The load should hopefully be merged with
2051 // the previous argument.
2052 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2053 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2054 int64_t AlignDownOffset = alignDown(Offset, 4);
2055 int64_t OffsetDiff = Offset - AlignDownOffset;
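  // Illustrative example (not in the original source): for a 2-byte argument
  // at Offset == 6, AlignDownOffset is 4 and OffsetDiff is 2, so the dword at
  // offset 4 is loaded, shifted right by 16 bits, and truncated to the
  // argument's 16-bit integer type below.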
2056
2057 EVT IntVT = MemVT.changeTypeToInteger();
2058
2059 // TODO: If we passed in the base kernel offset we could have a better
2060 // alignment than 4, but we don't really need it.
2061 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2062 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2065
2066 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2067 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2068
2069 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2070 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2071 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2072
2073
2074 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2075 }
2076
2077 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2078 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2081
2082 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2083 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2084}
2085
2086SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2087 const SDLoc &SL, SDValue Chain,
2088 const ISD::InputArg &Arg) const {
2090 MachineFrameInfo &MFI = MF.getFrameInfo();
2091
2092 if (Arg.Flags.isByVal()) {
2093 unsigned Size = Arg.Flags.getByValSize();
2094 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2095 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2096 }
2097
2098 unsigned ArgOffset = VA.getLocMemOffset();
2099 unsigned ArgSize = VA.getValVT().getStoreSize();
2100
2101 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2102
2103 // Create load nodes to retrieve arguments from the stack.
2104 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2105 SDValue ArgValue;
2106
2107 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2109 MVT MemVT = VA.getValVT();
2110
2111 switch (VA.getLocInfo()) {
2112 default:
2113 break;
2114 case CCValAssign::BCvt:
2115 MemVT = VA.getLocVT();
2116 break;
2117 case CCValAssign::SExt:
2118 ExtType = ISD::SEXTLOAD;
2119 break;
2120 case CCValAssign::ZExt:
2121 ExtType = ISD::ZEXTLOAD;
2122 break;
2123 case CCValAssign::AExt:
2124 ExtType = ISD::EXTLOAD;
2125 break;
2126 }
2127
2128 ArgValue = DAG.getExtLoad(
2129 ExtType, SL, VA.getLocVT(), Chain, FIN,
2131 MemVT);
2132 return ArgValue;
2133}
2134
2135SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2136 const SIMachineFunctionInfo &MFI,
2137 EVT VT,
2139 const ArgDescriptor *Reg = nullptr;
2140 const TargetRegisterClass *RC;
2141 LLT Ty;
2142
2144 const ArgDescriptor WorkGroupIDX =
2145 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2146 // If GridZ is not programmed in an entry function then the hardware will set
2147 // it to all zeros, so there is no need to mask the GridY value in the low
2148 // order bits.
2149 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2150 AMDGPU::TTMP7,
2151 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2152 const ArgDescriptor WorkGroupIDZ =
2153 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
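  // With architected SGPRs the hardware places the workgroup IDs in trap
  // temporaries: X in TTMP9, Y in the low half of TTMP7, and Z in the high
  // half of TTMP7 (Y is left unmasked when Z is unused, per the comment above).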
2154 if (Subtarget->hasArchitectedSGPRs() &&
2156 switch (PVID) {
2158 Reg = &WorkGroupIDX;
2159 RC = &AMDGPU::SReg_32RegClass;
2160 Ty = LLT::scalar(32);
2161 break;
2163 Reg = &WorkGroupIDY;
2164 RC = &AMDGPU::SReg_32RegClass;
2165 Ty = LLT::scalar(32);
2166 break;
2168 Reg = &WorkGroupIDZ;
2169 RC = &AMDGPU::SReg_32RegClass;
2170 Ty = LLT::scalar(32);
2171 break;
2172 default:
2173 break;
2174 }
2175 }
2176
2177 if (!Reg)
2178 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2179 if (!Reg) {
2181 // It's possible for a kernarg intrinsic call to appear in a kernel with
2182 // no allocated segment, in which case we do not add the user sgpr
2183 // argument, so just return null.
2184 return DAG.getConstant(0, SDLoc(), VT);
2185 }
2186
2187 // It's undefined behavior if a function marked with the amdgpu-no-*
2188 // attributes uses the corresponding intrinsic.
2189 return DAG.getUNDEF(VT);
2190 }
2191
2192 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2193}
2194
2196 CallingConv::ID CallConv,
2197 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2198 FunctionType *FType,
2199 SIMachineFunctionInfo *Info) {
2200 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2201 const ISD::InputArg *Arg = &Ins[I];
2202
2203 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2204 "vector type argument should have been split");
2205
2206 // First check if it's a PS input addr.
2207 if (CallConv == CallingConv::AMDGPU_PS &&
2208 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2209 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2210
2211 // Inconveniently only the first part of the split is marked as isSplit,
2212 // so skip to the end. We only want to increment PSInputNum once for the
2213 // entire split argument.
2214 if (Arg->Flags.isSplit()) {
2215 while (!Arg->Flags.isSplitEnd()) {
2216 assert((!Arg->VT.isVector() ||
2217 Arg->VT.getScalarSizeInBits() == 16) &&
2218 "unexpected vector split in ps argument type");
2219 if (!SkipArg)
2220 Splits.push_back(*Arg);
2221 Arg = &Ins[++I];
2222 }
2223 }
2224
2225 if (SkipArg) {
2226 // We can safely skip PS inputs.
2227 Skipped.set(Arg->getOrigArgIndex());
2228 ++PSInputNum;
2229 continue;
2230 }
2231
2232 Info->markPSInputAllocated(PSInputNum);
2233 if (Arg->Used)
2234 Info->markPSInputEnabled(PSInputNum);
2235
2236 ++PSInputNum;
2237 }
2238
2239 Splits.push_back(*Arg);
2240 }
2241}
2242
2243// Allocate special inputs passed in VGPRs.
2245 MachineFunction &MF,
2246 const SIRegisterInfo &TRI,
2247 SIMachineFunctionInfo &Info) const {
2248 const LLT S32 = LLT::scalar(32);
2250
2251 if (Info.hasWorkItemIDX()) {
2252 Register Reg = AMDGPU::VGPR0;
2253 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2254
2255 CCInfo.AllocateReg(Reg);
2256 unsigned Mask = (Subtarget->hasPackedTID() &&
2257 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
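    // With packed TID, workitem IDs X, Y, and Z share VGPR0 in bits [9:0],
    // [19:10], and [29:20]; the 0x3ff masks here and in the Y/Z cases below
    // select the corresponding field.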
2258 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2259 }
2260
2261 if (Info.hasWorkItemIDY()) {
2262 assert(Info.hasWorkItemIDX());
2263 if (Subtarget->hasPackedTID()) {
2264 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2265 0x3ff << 10));
2266 } else {
2267 unsigned Reg = AMDGPU::VGPR1;
2268 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2269
2270 CCInfo.AllocateReg(Reg);
2271 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2272 }
2273 }
2274
2275 if (Info.hasWorkItemIDZ()) {
2276 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2277 if (Subtarget->hasPackedTID()) {
2278 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2279 0x3ff << 20));
2280 } else {
2281 unsigned Reg = AMDGPU::VGPR2;
2282 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2283
2284 CCInfo.AllocateReg(Reg);
2285 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2286 }
2287 }
2288}
2289
2290 // Try to allocate a VGPR at the end of the argument list, or, if no argument
2291 // VGPRs are left, allocate a stack slot instead.
2292 // If \p Mask is given, it indicates the bitfield position in the register.
2293 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2294static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2295 ArgDescriptor Arg = ArgDescriptor()) {
2296 if (Arg.isSet())
2297 return ArgDescriptor::createArg(Arg, Mask);
2298
2299 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2300 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2301 if (RegIdx == ArgVGPRs.size()) {
2302 // Spill to stack required.
2303 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2304
2305 return ArgDescriptor::createStack(Offset, Mask);
2306 }
2307
2308 unsigned Reg = ArgVGPRs[RegIdx];
2309 Reg = CCInfo.AllocateReg(Reg);
2310 assert(Reg != AMDGPU::NoRegister);
2311
2312 MachineFunction &MF = CCInfo.getMachineFunction();
2313 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2314 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2315 return ArgDescriptor::createRegister(Reg, Mask);
2316}
2317
2319 const TargetRegisterClass *RC,
2320 unsigned NumArgRegs) {
2321 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2322 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2323 if (RegIdx == ArgSGPRs.size())
2324 report_fatal_error("ran out of SGPRs for arguments");
2325
2326 unsigned Reg = ArgSGPRs[RegIdx];
2327 Reg = CCInfo.AllocateReg(Reg);
2328 assert(Reg != AMDGPU::NoRegister);
2329
2330 MachineFunction &MF = CCInfo.getMachineFunction();
2331 MF.addLiveIn(Reg, RC);
2333}
2334
2335// If this has a fixed position, we still should allocate the register in the
2336// CCInfo state. Technically we could get away with this for values passed
2337// outside of the normal argument range.
2339 const TargetRegisterClass *RC,
2340 MCRegister Reg) {
2341 Reg = CCInfo.AllocateReg(Reg);
2342 assert(Reg != AMDGPU::NoRegister);
2343 MachineFunction &MF = CCInfo.getMachineFunction();
2344 MF.addLiveIn(Reg, RC);
2345}
2346
2347static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2348 if (Arg) {
2349 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2350 Arg.getRegister());
2351 } else
2352 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2353}
2354
2355static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2356 if (Arg) {
2357 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2358 Arg.getRegister());
2359 } else
2360 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2361}
2362
2363/// Allocate implicit function VGPR arguments at the end of allocated user
2364/// arguments.
2366 CCState &CCInfo, MachineFunction &MF,
2367 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2368 const unsigned Mask = 0x3ff;
2369 ArgDescriptor Arg;
2370
2371 if (Info.hasWorkItemIDX()) {
2372 Arg = allocateVGPR32Input(CCInfo, Mask);
2373 Info.setWorkItemIDX(Arg);
2374 }
2375
2376 if (Info.hasWorkItemIDY()) {
2377 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2378 Info.setWorkItemIDY(Arg);
2379 }
2380
2381 if (Info.hasWorkItemIDZ())
2382 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2383}
2384
2385/// Allocate implicit function VGPR arguments in fixed registers.
2387 CCState &CCInfo, MachineFunction &MF,
2388 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2389 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2390 if (!Reg)
2391 report_fatal_error("failed to allocate VGPR for implicit arguments");
2392
2393 const unsigned Mask = 0x3ff;
2394 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2395 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2396 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2397}
2398
2400 CCState &CCInfo,
2401 MachineFunction &MF,
2402 const SIRegisterInfo &TRI,
2403 SIMachineFunctionInfo &Info) const {
2404 auto &ArgInfo = Info.getArgInfo();
2405 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2406
2407 // TODO: Unify handling with private memory pointers.
2408 if (UserSGPRInfo.hasDispatchPtr())
2409 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2410
2411 const Module *M = MF.getFunction().getParent();
2412 if (UserSGPRInfo.hasQueuePtr() &&
2414 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2415
2416 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2417 // constant offset from the kernarg segment.
2418 if (Info.hasImplicitArgPtr())
2419 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2420
2421 if (UserSGPRInfo.hasDispatchID())
2422 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2423
2424 // flat_scratch_init is not applicable for non-kernel functions.
2425
2426 if (Info.hasWorkGroupIDX())
2427 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2428
2429 if (Info.hasWorkGroupIDY())
2430 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2431
2432 if (Info.hasWorkGroupIDZ())
2433 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2434
2435 if (Info.hasLDSKernelId())
2436 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2437}
2438
2439// Allocate special inputs passed in user SGPRs.
2441 MachineFunction &MF,
2442 const SIRegisterInfo &TRI,
2443 SIMachineFunctionInfo &Info) const {
2444 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2445 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2446 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2447 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2448 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2449 }
2450
2451 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2452 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2453 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2454 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2455 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2456 }
2457
2458 if (UserSGPRInfo.hasDispatchPtr()) {
2459 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2460 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2461 CCInfo.AllocateReg(DispatchPtrReg);
2462 }
2463
2464 const Module *M = MF.getFunction().getParent();
2465 if (UserSGPRInfo.hasQueuePtr() &&
2467 Register QueuePtrReg = Info.addQueuePtr(TRI);
2468 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2469 CCInfo.AllocateReg(QueuePtrReg);
2470 }
2471
2472 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2474 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2475 CCInfo.AllocateReg(InputPtrReg);
2476
2477 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2478 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2479 }
2480
2481 if (UserSGPRInfo.hasDispatchID()) {
2482 Register DispatchIDReg = Info.addDispatchID(TRI);
2483 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2484 CCInfo.AllocateReg(DispatchIDReg);
2485 }
2486
2487 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2488 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2489 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2490 CCInfo.AllocateReg(FlatScratchInitReg);
2491 }
2492
2493 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2494 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2495 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2496 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2497 }
2498
2499 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2500 // these from the dispatch pointer.
2501}
2502
2503 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2504// sequential starting from the first argument.
2506 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2508 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2509 Function &F = MF.getFunction();
2510 unsigned LastExplicitArgOffset =
2511 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2512 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2513 bool InPreloadSequence = true;
2514 unsigned InIdx = 0;
2515 for (auto &Arg : F.args()) {
2516 if (!InPreloadSequence || !Arg.hasInRegAttr())
2517 break;
2518
2519 int ArgIdx = Arg.getArgNo();
2520 // Don't preload non-original args or parts not in the current preload
2521 // sequence.
2522 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2523 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2524 break;
2525
2526 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2527 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2528 InIdx++) {
2529 assert(ArgLocs[ArgIdx].isMemLoc());
2530 auto &ArgLoc = ArgLocs[InIdx];
2531 const Align KernelArgBaseAlign = Align(16);
2532 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2533 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2534 unsigned NumAllocSGPRs =
2535 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2536
2537 // Arg is preloaded into the previous SGPR.
2538 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2539 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2540 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2541 continue;
2542 }
2543
2544 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2545 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
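      // PaddingSGPRs accounts for any gap between the end of the previously
      // preloaded argument and this argument's offset; preloading is
      // sequential, so those SGPRs are still consumed.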
2546 // Check for free user SGPRs for preloading.
2547 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2548 SGPRInfo.getNumFreeUserSGPRs()) {
2549 InPreloadSequence = false;
2550 break;
2551 }
2552
2553 // Preload this argument.
2554 const TargetRegisterClass *RC =
2555 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2556 SmallVectorImpl<MCRegister> *PreloadRegs =
2557 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2558
2559 if (PreloadRegs->size() > 1)
2560 RC = &AMDGPU::SGPR_32RegClass;
2561 for (auto &Reg : *PreloadRegs) {
2562 assert(Reg);
2563 MF.addLiveIn(Reg, RC);
2564 CCInfo.AllocateReg(Reg);
2565 }
2566
2567 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2568 }
2569 }
2570}
2571
2573 const SIRegisterInfo &TRI,
2574 SIMachineFunctionInfo &Info) const {
2575 // Always allocate this last since it is a synthetic preload.
2576 if (Info.hasLDSKernelId()) {
2577 Register Reg = Info.addLDSKernelId();
2578 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2579 CCInfo.AllocateReg(Reg);
2580 }
2581}
2582
2583// Allocate special input registers that are initialized per-wave.
2585 MachineFunction &MF,
2587 CallingConv::ID CallConv,
2588 bool IsShader) const {
2589 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2590 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2591 // Note: user SGPRs are handled by the front-end for graphics shaders
2592 // Pad up the used user SGPRs with dead inputs.
2593
2594 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2595 // before enabling architected SGPRs for workgroup IDs.
2596 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2597
2598 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2599 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2600 // rely on it to reach 16 since if we end up having no stack usage, it will
2601 // not really be added.
2602 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2603 Info.hasWorkGroupIDY() +
2604 Info.hasWorkGroupIDZ() +
2605 Info.hasWorkGroupInfo();
2606 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2607 Register Reg = Info.addReservedUserSGPR();
2608 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2609 CCInfo.AllocateReg(Reg);
2610 }
2611 }
2612
2613 if (!HasArchitectedSGPRs) {
2614 if (Info.hasWorkGroupIDX()) {
2615 Register Reg = Info.addWorkGroupIDX();
2616 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2617 CCInfo.AllocateReg(Reg);
2618 }
2619
2620 if (Info.hasWorkGroupIDY()) {
2621 Register Reg = Info.addWorkGroupIDY();
2622 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2623 CCInfo.AllocateReg(Reg);
2624 }
2625
2626 if (Info.hasWorkGroupIDZ()) {
2627 Register Reg = Info.addWorkGroupIDZ();
2628 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2629 CCInfo.AllocateReg(Reg);
2630 }
2631 }
2632
2633 if (Info.hasWorkGroupInfo()) {
2634 Register Reg = Info.addWorkGroupInfo();
2635 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2636 CCInfo.AllocateReg(Reg);
2637 }
2638
2639 if (Info.hasPrivateSegmentWaveByteOffset()) {
2640 // Scratch wave offset passed in system SGPR.
2641 unsigned PrivateSegmentWaveByteOffsetReg;
2642
2643 if (IsShader) {
2644 PrivateSegmentWaveByteOffsetReg =
2645 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2646
2647 // This is true if the scratch wave byte offset doesn't have a fixed
2648 // location.
2649 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2650 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2651 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2652 }
2653 } else
2654 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2655
2656 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2657 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2658 }
2659
2660 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2661 Info.getNumPreloadedSGPRs() >= 16);
2662}
2663
2665 MachineFunction &MF,
2666 const SIRegisterInfo &TRI,
2667 SIMachineFunctionInfo &Info) {
2668 // Now that we've figured out where the scratch register inputs are, see if
2669 // we should reserve the arguments and use them directly.
2670 MachineFrameInfo &MFI = MF.getFrameInfo();
2671 bool HasStackObjects = MFI.hasStackObjects();
2672 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2673
2674 // Record that we know we have non-spill stack objects so we don't need to
2675 // check all stack objects later.
2676 if (HasStackObjects)
2677 Info.setHasNonSpillStackObjects(true);
2678
2679 // Everything live out of a block is spilled with fast regalloc, so it's
2680 // almost certain that spilling will be required.
2681 if (TM.getOptLevel() == CodeGenOptLevel::None)
2682 HasStackObjects = true;
2683
2684 // For now assume stack access is needed in any callee functions, so we need
2685 // the scratch registers to pass in.
2686 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2687
2688 if (!ST.enableFlatScratch()) {
2689 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2690 // If we have stack objects, we unquestionably need the private buffer
2691 // resource. For the Code Object V2 ABI, this will be the first 4 user
2692 // SGPR inputs. We can reserve those and use them directly.
2693
2694 Register PrivateSegmentBufferReg =
2696 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2697 } else {
2698 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2699 // We tentatively reserve the last registers (skipping the last few, which
2700 // may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2701 // we'll replace these with the ones immediately after those which were
2702 // really allocated. In the prologue copies will be inserted from the
2703 // argument to these reserved registers.
2704
2705 // Without HSA, relocations are used for the scratch pointer and the
2706 // buffer resource setup is always inserted in the prologue. Scratch wave
2707 // offset is still in an input SGPR.
2708 Info.setScratchRSrcReg(ReservedBufferReg);
2709 }
2710 }
2711
2713
2714 // For entry functions we have to set up the stack pointer if we use it,
2715 // whereas non-entry functions get this "for free". This means there is no
2716 // intrinsic advantage to using S32 over S34 in cases where we do not have
2717 // calls but do need a frame pointer (i.e. if we are requested to have one
2718 // because frame pointer elimination is disabled). To keep things simple we
2719 // only ever use S32 as the call ABI stack pointer, and so using it does not
2720 // imply we need a separate frame pointer.
2721 //
2722 // Try to use s32 as the SP, but move it if it would interfere with input
2723 // arguments. This won't work with calls though.
2724 //
2725 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2726 // registers.
2727 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2728 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2729 } else {
2731
2732 if (MFI.hasCalls())
2733 report_fatal_error("call in graphics shader with too many input SGPRs");
2734
2735 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2736 if (!MRI.isLiveIn(Reg)) {
2737 Info.setStackPtrOffsetReg(Reg);
2738 break;
2739 }
2740 }
2741
2742 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2743 report_fatal_error("failed to find register for SP");
2744 }
2745
2746 // hasFP should be accurate for entry functions even before the frame is
2747 // finalized, because it does not rely on the known stack size, only
2748 // properties like whether variable sized objects are present.
2749 if (ST.getFrameLowering()->hasFP(MF)) {
2750 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2751 }
2752}
2753
2756 return !Info->isEntryFunction();
2757}
2758
2760
2761}
2762
2764 MachineBasicBlock *Entry,
2765 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2767
2768 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2769 if (!IStart)
2770 return;
2771
2772 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2773 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2774 MachineBasicBlock::iterator MBBI = Entry->begin();
2775 for (const MCPhysReg *I = IStart; *I; ++I) {
2776 const TargetRegisterClass *RC = nullptr;
2777 if (AMDGPU::SReg_64RegClass.contains(*I))
2778 RC = &AMDGPU::SGPR_64RegClass;
2779 else if (AMDGPU::SReg_32RegClass.contains(*I))
2780 RC = &AMDGPU::SGPR_32RegClass;
2781 else
2782 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2783
2784 Register NewVR = MRI->createVirtualRegister(RC);
2785 // Create copy from CSR to a virtual register.
2786 Entry->addLiveIn(*I);
2787 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2788 .addReg(*I);
2789
2790 // Insert the copy-back instructions right before the terminator.
2791 for (auto *Exit : Exits)
2792 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2793 TII->get(TargetOpcode::COPY), *I)
2794 .addReg(NewVR);
2795 }
2796}
2797
2799 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2800 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2801 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2803
2805 const Function &Fn = MF.getFunction();
2808
2809 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2810 DiagnosticInfoUnsupported NoGraphicsHSA(
2811 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2812 DAG.getContext()->diagnose(NoGraphicsHSA);
2813 return DAG.getEntryNode();
2814 }
2815
2818 BitVector Skipped(Ins.size());
2819 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2820 *DAG.getContext());
2821
2822 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2823 bool IsKernel = AMDGPU::isKernel(CallConv);
2824 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2825
2826 if (IsGraphics) {
2827 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2828 assert(!UserSGPRInfo.hasDispatchPtr() &&
2829 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2830 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2831 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2832 (void)UserSGPRInfo;
2833 if (!Subtarget->enableFlatScratch())
2834 assert(!UserSGPRInfo.hasFlatScratchInit());
2835 if ((CallConv != CallingConv::AMDGPU_CS &&
2836 CallConv != CallingConv::AMDGPU_Gfx) ||
2837 !Subtarget->hasArchitectedSGPRs())
2838 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2839 !Info->hasWorkGroupIDZ());
2840 }
2841
2842 if (CallConv == CallingConv::AMDGPU_PS) {
2843 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2844
2845 // At least one interpolation mode must be enabled or else the GPU will
2846 // hang.
2847 //
2848 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2849 // set PSInputAddr, the user wants to enable some bits after the compilation
2850 // based on run-time states. Since we can't know what the final PSInputEna
2851 // will look like, we shouldn't do anything here and the user should take
2852 // responsibility for the correct programming.
2853 //
2854 // Otherwise, the following restrictions apply:
2855 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2856 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2857 // enabled too.
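  // (PERSP_* occupies bits 0-3 and LINEAR_* bits 4-6, so the 0x7F mask below
  // checks both groups at once; bit 11 is POS_W_FLOAT.)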
2858 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2859 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2860 CCInfo.AllocateReg(AMDGPU::VGPR0);
2861 CCInfo.AllocateReg(AMDGPU::VGPR1);
2862 Info->markPSInputAllocated(0);
2863 Info->markPSInputEnabled(0);
2864 }
2865 if (Subtarget->isAmdPalOS()) {
2866 // For isAmdPalOS, the user does not enable some bits after compilation
2867 // based on run-time states; the register values being generated here are
2868 // the final ones set in hardware. Therefore we need to apply the
2869 // workaround to PSInputAddr and PSInputEnable together. (The case where
2870 // a bit is set in PSInputAddr but not PSInputEnable is where the
2871 // frontend set up an input arg for a particular interpolation mode, but
2872 // nothing uses that input arg. Really we should have an earlier pass
2873 // that removes such an arg.)
2874 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2875 if ((PsInputBits & 0x7F) == 0 ||
2876 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2877 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2878 }
2879 } else if (IsKernel) {
2880 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2881 } else {
2882 Splits.append(Ins.begin(), Ins.end());
2883 }
2884
2885 if (IsKernel)
2886 analyzeFormalArgumentsCompute(CCInfo, Ins);
2887
2888 if (IsEntryFunc) {
2889 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2890 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2891 if (IsKernel && Subtarget->hasKernargPreload())
2892 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2893
2894 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2895 } else if (!IsGraphics) {
2896 // For the fixed ABI, pass workitem IDs in the last argument register.
2897 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2898
2899 // FIXME: Sink this into allocateSpecialInputSGPRs
2900 if (!Subtarget->enableFlatScratch())
2901 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2902
2903 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2904 }
2905
2906 if (!IsKernel) {
2907 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2908 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2909 }
2910
2912
2913 // FIXME: This is the minimum kernel argument alignment. We should improve
2914 // this to the maximum alignment of the arguments.
2915 //
2916 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2917 // kern arg offset.
2918 const Align KernelArgBaseAlign = Align(16);
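  // The kernarg segment base is assumed here to be at least 16-byte aligned,
  // so commonAlignment(KernelArgBaseAlign, Offset) gives the best alignment
  // that can be proven for each individual argument.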
2919
2920 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2921 const ISD::InputArg &Arg = Ins[i];
2922 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2923 InVals.push_back(DAG.getUNDEF(Arg.VT));
2924 continue;
2925 }
2926
2927 CCValAssign &VA = ArgLocs[ArgIdx++];
2928 MVT VT = VA.getLocVT();
2929
2930 if (IsEntryFunc && VA.isMemLoc()) {
2931 VT = Ins[i].VT;
2932 EVT MemVT = VA.getLocVT();
2933
2934 const uint64_t Offset = VA.getLocMemOffset();
2935 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2936
2937 if (Arg.Flags.isByRef()) {
2938 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2939
2940 const GCNTargetMachine &TM =
2941 static_cast<const GCNTargetMachine &>(getTargetMachine());
2942 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2943 Arg.Flags.getPointerAddrSpace())) {
2946 }
2947
2948 InVals.push_back(Ptr);
2949 continue;
2950 }
2951
2952 SDValue NewArg;
2953 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2954 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2955 // In this case the argument is packed into the previous preload SGPR.
2956 int64_t AlignDownOffset = alignDown(Offset, 4);
2957 int64_t OffsetDiff = Offset - AlignDownOffset;
2958 EVT IntVT = MemVT.changeTypeToInteger();
2959
2963 Register Reg =
2964 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2965
2966 assert(Reg);
2967 Register VReg = MRI.getLiveInVirtReg(Reg);
2968 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2969
2970 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2971 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2972
2973 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2974 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2975 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2976 Ins[i].Flags.isSExt(), &Ins[i]);
2977
2978 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2979 } else {
2983 const SmallVectorImpl<MCRegister> &PreloadRegs =
2984 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2985
2986 SDValue Copy;
2987 if (PreloadRegs.size() == 1) {
2988 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2989 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2990 NewArg = DAG.getCopyFromReg(
2991 Chain, DL, VReg,
2993 TRI->getRegSizeInBits(*RC)));
2994
2995 } else {
2996 // If the kernarg alignment does not match the alignment of the SGPR
2997 // tuple RC that can accommodate this argument, it will be built up
2998 // via copies from the individual SGPRs that the argument was
2999 // preloaded to.
3001 for (auto Reg : PreloadRegs) {
3002 Register VReg = MRI.getLiveInVirtReg(Reg);
3003 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3004 Elts.push_back(Copy);
3005 }
3006 NewArg =
3007 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3008 PreloadRegs.size()),
3009 DL, Elts);
3010 }
3011
3012 // If the argument was preloaded to multiple consecutive 32-bit
3013 // registers because of misalignment between addressable SGPR tuples
3014 // and the argument size, we can still assume, because of kernarg
3015 // segment alignment restrictions, that NewArg's size is the same as
3016 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3017 // truncate since we cannot preload to less than a single SGPR and the
3018 // MemVT may be smaller.
3019 EVT MemVTInt =
3021 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3022 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3023
3024 NewArg = DAG.getBitcast(MemVT, NewArg);
3025 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3026 Ins[i].Flags.isSExt(), &Ins[i]);
3027 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3028 }
3029 } else {
3030 NewArg =
3031 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3032 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3033 }
3034 Chains.push_back(NewArg.getValue(1));
3035
3036 auto *ParamTy =
3037 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3039 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3040 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3041 // On SI local pointers are just offsets into LDS, so they are always
3042 // less than 16-bits. On CI and newer they could potentially be
3043 // real pointers, so we can't guarantee their size.
3044 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3045 DAG.getValueType(MVT::i16));
3046 }
3047
3048 InVals.push_back(NewArg);
3049 continue;
3050 }
3051 if (!IsEntryFunc && VA.isMemLoc()) {
3052 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3053 InVals.push_back(Val);
3054 if (!Arg.Flags.isByVal())
3055 Chains.push_back(Val.getValue(1));
3056 continue;
3057 }
3058
3059 assert(VA.isRegLoc() && "Parameter must be in a register!");
3060
3061 Register Reg = VA.getLocReg();
3062 const TargetRegisterClass *RC = nullptr;
3063 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3064 RC = &AMDGPU::VGPR_32RegClass;
3065 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3066 RC = &AMDGPU::SGPR_32RegClass;
3067 else
3068 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3069 EVT ValVT = VA.getValVT();
3070
3071 Reg = MF.addLiveIn(Reg, RC);
3072 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3073
3074 if (Arg.Flags.isSRet()) {
3075 // The return object should be reasonably addressable.
3076
3077 // FIXME: This helps when the return is a real sret. If it is an
3078 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3079 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3080 unsigned NumBits
3082 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3083 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3084 }
3085
3086 // If this is an 8 or 16-bit value, it is really passed promoted
3087 // to 32 bits. Insert an assert[sz]ext to capture this, then
3088 // truncate to the right size.
3089 switch (VA.getLocInfo()) {
3090 case CCValAssign::Full:
3091 break;
3092 case CCValAssign::BCvt:
3093 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3094 break;
3095 case CCValAssign::SExt:
3096 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3097 DAG.getValueType(ValVT));
3098 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3099 break;
3100 case CCValAssign::ZExt:
3101 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3102 DAG.getValueType(ValVT));
3103 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3104 break;
3105 case CCValAssign::AExt:
3106 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3107 break;
3108 default:
3109 llvm_unreachable("Unknown loc info!");
3110 }
3111
3112 InVals.push_back(Val);
3113 }
3114
3115 // Start adding system SGPRs.
3116 if (IsEntryFunc)
3117 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3118
3119 // DAG.getPass() returns nullptr when using new pass manager.
3120 // TODO: Use DAG.getMFAM() to access analysis result.
3121 if (DAG.getPass()) {
3122 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3123 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3124 }
3125
3126 unsigned StackArgSize = CCInfo.getStackSize();
3127 Info->setBytesInStackArgArea(StackArgSize);
3128
3129 return Chains.empty() ? Chain :
3130 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3131}
3132
3133// TODO: If return values can't fit in registers, we should return as many as
3134// possible in registers before passing on stack.
3136 CallingConv::ID CallConv,
3137 MachineFunction &MF, bool IsVarArg,
3139 LLVMContext &Context) const {
3140 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3141 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3142 // for shaders. Vector types should be explicitly handled by CC.
3143 if (AMDGPU::isEntryFunctionCC(CallConv))
3144 return true;
3145
3147 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3148 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3149 return false;
3150
3151 // We must use the stack if return would require unavailable registers.
3152 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3153 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3154 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3155 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3156 return false;
3157
3158 return true;
3159}
3160
3161SDValue
3163 bool isVarArg,
3165 const SmallVectorImpl<SDValue> &OutVals,
3166 const SDLoc &DL, SelectionDAG &DAG) const {
3169
3170 if (AMDGPU::isKernel(CallConv)) {
3171 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3172 OutVals, DL, DAG);
3173 }
3174
3175 bool IsShader = AMDGPU::isShader(CallConv);
3176
3177 Info->setIfReturnsVoid(Outs.empty());
3178 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3179
3180 // CCValAssign - represent the assignment of the return value to a location.
3183
3184 // CCState - Info about the registers and stack slots.
3185 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3186 *DAG.getContext());
3187
3188 // Analyze outgoing return values.
3189 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3190
3191 SDValue Glue;
3193 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3194
3195 // Copy the result values into the output registers.
3196 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3197 ++I, ++RealRVLocIdx) {
3198 CCValAssign &VA = RVLocs[I];
3199 assert(VA.isRegLoc() && "Can only return in registers!");
3200 // TODO: Partially return in registers if return values don't fit.
3201 SDValue Arg = OutVals[RealRVLocIdx];
3202
3203 // Copied from other backends.
3204 switch (VA.getLocInfo()) {
3205 case CCValAssign::Full:
3206 break;
3207 case CCValAssign::BCvt:
3208 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3209 break;
3210 case CCValAssign::SExt:
3211 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3212 break;
3213 case CCValAssign::ZExt:
3214 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3215 break;
3216 case CCValAssign::AExt:
3217 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3218 break;
3219 default:
3220 llvm_unreachable("Unknown loc info!");
3221 }
3222
3223 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3224 Glue = Chain.getValue(1);
3225 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3226 }
3227
3228 // FIXME: Does sret work properly?
3229 if (!Info->isEntryFunction()) {
3230 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3231 const MCPhysReg *I =
3232 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3233 if (I) {
3234 for (; *I; ++I) {
3235 if (AMDGPU::SReg_64RegClass.contains(*I))
3236 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3237 else if (AMDGPU::SReg_32RegClass.contains(*I))
3238 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3239 else
3240 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3241 }
3242 }
3243 }
3244
3245 // Update chain and glue.
3246 RetOps[0] = Chain;
3247 if (Glue.getNode())
3248 RetOps.push_back(Glue);
3249
3250 unsigned Opc = AMDGPUISD::ENDPGM;
3251 if (!IsWaveEnd)
3253 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3254}
3255
3257 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3258 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3259 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3260 SDValue ThisVal) const {
3261 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3262
3263 // Assign locations to each value returned by this call.
3265 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3266 *DAG.getContext());
3267 CCInfo.AnalyzeCallResult(Ins, RetCC);
3268
3269 // Copy all of the result registers out of their specified physreg.
3270 for (CCValAssign VA : RVLocs) {
3271 SDValue Val;
3272
3273 if (VA.isRegLoc()) {
3274 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3275 Chain = Val.getValue(1);
3276 InGlue = Val.getValue(2);
3277 } else if (VA.isMemLoc()) {
3278 report_fatal_error("TODO: return values in memory");
3279 } else
3280 llvm_unreachable("unknown argument location type");
3281
3282 switch (VA.getLocInfo()) {
3283 case CCValAssign::Full:
3284 break;
3285 case CCValAssign::BCvt:
3286 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3287 break;
3288 case CCValAssign::ZExt:
3289 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3290 DAG.getValueType(VA.getValVT()));
3291 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3292 break;
3293 case CCValAssign::SExt:
3294 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3295 DAG.getValueType(VA.getValVT()));
3296 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3297 break;
3298 case CCValAssign::AExt:
3299 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3300 break;
3301 default:
3302 llvm_unreachable("Unknown loc info!");
3303 }
3304
3305 InVals.push_back(Val);
3306 }
3307
3308 return Chain;
3309}
3310
3311 // Add code to pass the special inputs required by the used features, separate
3312// from the explicit user arguments present in the IR.
3314 CallLoweringInfo &CLI,
3315 CCState &CCInfo,
3316 const SIMachineFunctionInfo &Info,
3317 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3318 SmallVectorImpl<SDValue> &MemOpChains,
3319 SDValue Chain) const {
3320 // If we don't have a call site, this was a call inserted by
3321 // legalization. These can never use special inputs.
3322 if (!CLI.CB)
3323 return;
3324
3325 SelectionDAG &DAG = CLI.DAG;
3326 const SDLoc &DL = CLI.DL;
3327 const Function &F = DAG.getMachineFunction().getFunction();
3328
3329 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3330 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3331
3332 const AMDGPUFunctionArgInfo *CalleeArgInfo
3334 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3335 // DAG.getPass() returns nullptr when using new pass manager.
3336 // TODO: Use DAG.getMFAM() to access analysis result.
3337 if (DAG.getPass()) {
3338 auto &ArgUsageInfo =
3340 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3341 }
3342 }
3343
3344 // TODO: Unify with private memory register handling. This is complicated by
3345 // the fact that at least in kernels, the input argument is not necessarily
3346 // in the same location as the input.
3347 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3349 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3350 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3351 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3352 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3353 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3354 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3355 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3356 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3357 };
3358
3359 for (auto Attr : ImplicitAttrs) {
3360 const ArgDescriptor *OutgoingArg;
3361 const TargetRegisterClass *ArgRC;
3362 LLT ArgTy;
3363
3364 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3365
3366 // If the callee does not use the attribute value, skip copying the value.
3367 if (CLI.CB->hasFnAttr(Attr.second))
3368 continue;
3369
3370 std::tie(OutgoingArg, ArgRC, ArgTy) =
3371 CalleeArgInfo->getPreloadedValue(InputID);
3372 if (!OutgoingArg)
3373 continue;
3374
3375 const ArgDescriptor *IncomingArg;
3376 const TargetRegisterClass *IncomingArgRC;
3377 LLT Ty;
3378 std::tie(IncomingArg, IncomingArgRC, Ty) =
3379 CallerArgInfo.getPreloadedValue(InputID);
3380 assert(IncomingArgRC == ArgRC);
3381
3382 // All special arguments are ints for now.
3383 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3384 SDValue InputReg;
3385
3386 if (IncomingArg) {
3387 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3388 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3389 // The implicit arg ptr is special because it doesn't have a corresponding
3390 // input for kernels, and is computed from the kernarg segment pointer.
3391 InputReg = getImplicitArgPtr(DAG, DL);
3392 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3393 std::optional<uint32_t> Id =
3395 if (Id.has_value()) {
3396 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3397 } else {
3398 InputReg = DAG.getUNDEF(ArgVT);
3399 }
3400 } else {
3401 // We may have proven the input wasn't needed, although the ABI is
3402 // requiring it. We just need to allocate the register appropriately.
3403 InputReg = DAG.getUNDEF(ArgVT);
3404 }
3405
3406 if (OutgoingArg->isRegister()) {
3407 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3408 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3409 report_fatal_error("failed to allocate implicit input argument");
3410 } else {
3411 unsigned SpecialArgOffset =
3412 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3413 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3414 SpecialArgOffset);
3415 MemOpChains.push_back(ArgStore);
3416 }
3417 }
3418
3419 // Pack workitem IDs into a single register, or pass them as-is if already
3420 // packed.
3421 const ArgDescriptor *OutgoingArg;
3422 const TargetRegisterClass *ArgRC;
3423 LLT Ty;
3424
3425 std::tie(OutgoingArg, ArgRC, Ty) =
3427 if (!OutgoingArg)
3428 std::tie(OutgoingArg, ArgRC, Ty) =
3430 if (!OutgoingArg)
3431 std::tie(OutgoingArg, ArgRC, Ty) =
3433 if (!OutgoingArg)
3434 return;
3435
3436 const ArgDescriptor *IncomingArgX = std::get<0>(
3438 const ArgDescriptor *IncomingArgY = std::get<0>(
3440 const ArgDescriptor *IncomingArgZ = std::get<0>(
3442
3443 SDValue InputReg;
3444 SDLoc SL;
3445
3446 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3447 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3448 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3449
3450 // If incoming ids are not packed we need to pack them.
3451 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3452 NeedWorkItemIDX) {
3453 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3454 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3455 } else {
3456 InputReg = DAG.getConstant(0, DL, MVT::i32);
3457 }
3458 }
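  // Y and Z are shifted into bits [19:10] and [29:20] and OR'd into the same
  // i32, mirroring the packed workitem ID layout used for entry functions.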
3459
3460 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3461 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3462 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3463 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3464 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3465 InputReg = InputReg.getNode() ?
3466 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3467 }
3468
3469 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3470 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3471 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3472 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3473 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3474 InputReg = InputReg.getNode() ?
3475 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3476 }
3477
3478 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3479 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3480 // We're in a situation where the outgoing function requires the workitem
3482 // ID, but the calling function does not have it (e.g. a graphics function
3482 // calling a C calling convention function). This is illegal, but we need
3483 // to produce something.
3484 InputReg = DAG.getUNDEF(MVT::i32);
3485 } else {
3486 // Workitem IDs are already packed; any of the present incoming arguments
3487 // will carry all required fields.
3489 IncomingArgX ? *IncomingArgX :
3490 IncomingArgY ? *IncomingArgY :
3491 *IncomingArgZ, ~0u);
3492 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3493 }
3494 }
3495
3496 if (OutgoingArg->isRegister()) {
3497 if (InputReg)
3498 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3499
3500 CCInfo.AllocateReg(OutgoingArg->getRegister());
3501 } else {
3502 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3503 if (InputReg) {
3504 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3505 SpecialArgOffset);
3506 MemOpChains.push_back(ArgStore);
3507 }
3508 }
3509}
3510
3512 return CC == CallingConv::Fast;
3513}
3514
3515/// Return true if we might ever do TCO for calls with this calling convention.
3517 switch (CC) {
3518 case CallingConv::C:
3520 return true;
3521 default:
3522 return canGuaranteeTCO(CC);
3523 }
3524}
3525
3527 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3529 const SmallVectorImpl<SDValue> &OutVals,
3530 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3531 if (AMDGPU::isChainCC(CalleeCC))
3532 return true;
3533
3534 if (!mayTailCallThisCC(CalleeCC))
3535 return false;
3536
3537 // For a divergent call target, we need to do a waterfall loop over the
3538 // possible callees which precludes us from using a simple jump.
3539 if (Callee->isDivergent())
3540 return false;
3541
3543 const Function &CallerF = MF.getFunction();
3544 CallingConv::ID CallerCC = CallerF.getCallingConv();
3546 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3547
3548 // Kernels aren't callable, and don't have a live in return address so it
3549 // doesn't make sense to do a tail call with entry functions.
3550 if (!CallerPreserved)
3551 return false;
3552
3553 bool CCMatch = CallerCC == CalleeCC;
3554
3556 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3557 return true;
3558 return false;
3559 }
3560
3561 // TODO: Can we handle var args?
3562 if (IsVarArg)
3563 return false;
3564
3565 for (const Argument &Arg : CallerF.args()) {
3566 if (Arg.hasByValAttr())
3567 return false;
3568 }
3569
3570 LLVMContext &Ctx = *DAG.getContext();
3571
3572 // Check that the call results are passed in the same way.
3573 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3574 CCAssignFnForCall(CalleeCC, IsVarArg),
3575 CCAssignFnForCall(CallerCC, IsVarArg)))
3576 return false;
3577
3578 // The callee has to preserve all registers the caller needs to preserve.
3579 if (!CCMatch) {
3580 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3581 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3582 return false;
3583 }
3584
3585 // Nothing more to check if the callee is taking no arguments.
3586 if (Outs.empty())
3587 return true;
3588
3590 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3591
3592 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3593
3594 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3595 // If the stack arguments for this call do not fit into our own save area then
3596 // the call cannot be made tail.
3597 // TODO: Is this really necessary?
3598 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3599 return false;
3600
3601 const MachineRegisterInfo &MRI = MF.getRegInfo();
3602 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3603}
3604
3606 if (!CI->isTailCall())
3607 return false;
3608
3609 const Function *ParentFn = CI->getParent()->getParent();
3611 return false;
3612 return true;
3613}
3614
3615// The wave scratch offset register is used as the global base pointer.
3617 SmallVectorImpl<SDValue> &InVals) const {
3618 CallingConv::ID CallConv = CLI.CallConv;
3619 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3620
3621 SelectionDAG &DAG = CLI.DAG;
3622
3623 TargetLowering::ArgListEntry RequestedExec;
3624 if (IsChainCallConv) {
3625 // The last argument should be the value that we need to put in EXEC.
3626 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3627 // don't treat it like the rest of the arguments.
3628 RequestedExec = CLI.Args.back();
3629 assert(RequestedExec.Node && "No node for EXEC");
3630
3631 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3632 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3633
3634 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3635 CLI.Outs.pop_back();
3636 CLI.OutVals.pop_back();
3637
3638 if (RequestedExec.Ty->isIntegerTy(64)) {
3639 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3640 CLI.Outs.pop_back();
3641 CLI.OutVals.pop_back();
3642 }
3643
3644 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3645 "Haven't popped all the pieces of the EXEC mask");
3646 }
3647
3648 const SDLoc &DL = CLI.DL;
3650 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3652 SDValue Chain = CLI.Chain;
3653 SDValue Callee = CLI.Callee;
3654 bool &IsTailCall = CLI.IsTailCall;
3655 bool IsVarArg = CLI.IsVarArg;
3656 bool IsSibCall = false;
3658
3659 if (Callee.isUndef() || isNullConstant(Callee)) {
3660 if (!CLI.IsTailCall) {
3661 for (ISD::InputArg &Arg : CLI.Ins)
3662 InVals.push_back(DAG.getUNDEF(Arg.VT));
3663 }
3664
3665 return Chain;
3666 }
3667
3668 if (IsVarArg) {
3669 return lowerUnhandledCall(CLI, InVals,
3670 "unsupported call to variadic function ");
3671 }
3672
3673 if (!CLI.CB)
3674 report_fatal_error("unsupported libcall legalization");
3675
3676 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3677 return lowerUnhandledCall(CLI, InVals,
3678 "unsupported required tail call to function ");
3679 }
3680
3681 if (IsTailCall) {
3683 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3684 if (!IsTailCall &&
3685 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3686 report_fatal_error("failed to perform tail call elimination on a call "
3687 "site marked musttail or on llvm.amdgcn.cs.chain");
3688 }
3689
3690 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3691
3692 // A sibling call is one where we're under the usual C ABI and not planning
3693 // to change that but can still do a tail call:
3694 if (!TailCallOpt && IsTailCall)
3695 IsSibCall = true;
3696
3697 if (IsTailCall)
3698 ++NumTailCalls;
3699 }
3700
3703 SmallVector<SDValue, 8> MemOpChains;
3704
3705 // Analyze operands of the call, assigning locations to each operand.
3707 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3708 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3709
3710 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3711 // With a fixed ABI, allocate fixed registers before user arguments.
3712 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3713 }
3714
3715 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3716
3717 // Get a count of how many bytes are to be pushed on the stack.
3718 unsigned NumBytes = CCInfo.getStackSize();
3719
3720 if (IsSibCall) {
3721 // Since we're not changing the ABI to make this a tail call, the memory
3722 // operands are already available in the caller's incoming argument space.
3723 NumBytes = 0;
3724 }
3725
3726 // FPDiff is the byte offset of the call's argument area from the callee's.
3727 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3728 // by this amount for a tail call. In a sibling call it must be 0 because the
3729 // caller will deallocate the entire stack and the callee still expects its
3730 // arguments to begin at SP+0. Completely unused for non-tail calls.
3731 int32_t FPDiff = 0;
3732 MachineFrameInfo &MFI = MF.getFrameInfo();
3733
3734 // Adjust the stack pointer for the new arguments...
3735 // These operations are automatically eliminated by the prolog/epilog pass
3736 if (!IsSibCall)
3737 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3738
3739 if (!IsSibCall || IsChainCallConv) {
3740 if (!Subtarget->enableFlatScratch()) {
3741 SmallVector<SDValue, 4> CopyFromChains;
3742
3743 // In the HSA case, this should be an identity copy.
3744 SDValue ScratchRSrcReg
3745 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3746 RegsToPass.emplace_back(IsChainCallConv
3747 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3748 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3749 ScratchRSrcReg);
3750 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3751 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3752 }
3753 }
3754
3755 MVT PtrVT = MVT::i32;
3756
3757 // Walk the register/memloc assignments, inserting copies/loads.
3758 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3759 CCValAssign &VA = ArgLocs[i];
3760 SDValue Arg = OutVals[i];
3761
3762 // Promote the value if needed.
3763 switch (VA.getLocInfo()) {
3764 case CCValAssign::Full:
3765 break;
3766 case CCValAssign::BCvt:
3767 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3768 break;
3769 case CCValAssign::ZExt:
3770 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3771 break;
3772 case CCValAssign::SExt:
3773 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3774 break;
3775 case CCValAssign::AExt:
3776 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3777 break;
3778 case CCValAssign::FPExt:
3779 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3780 break;
3781 default:
3782 llvm_unreachable("Unknown loc info!");
3783 }
3784
3785 if (VA.isRegLoc()) {
3786 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3787 } else {
3788 assert(VA.isMemLoc());
3789
3790 SDValue DstAddr;
3791 MachinePointerInfo DstInfo;
3792
3793 unsigned LocMemOffset = VA.getLocMemOffset();
3794 int32_t Offset = LocMemOffset;
3795
3796 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3797 MaybeAlign Alignment;
3798
3799 if (IsTailCall) {
3800 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3801 unsigned OpSize = Flags.isByVal() ?
3802 Flags.getByValSize() : VA.getValVT().getStoreSize();
3803
3804 // FIXME: We can have better than the minimum byval required alignment.
3805 Alignment =
3806 Flags.isByVal()
3807 ? Flags.getNonZeroByValAlign()
3808 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3809
3810 Offset = Offset + FPDiff;
3811 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3812
3813 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3814 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3815
3816 // Make sure any stack arguments overlapping with where we're storing
3817 // are loaded before this eventual operation. Otherwise they'll be
3818 // clobbered.
3819
3820 // FIXME: Why is this really necessary? This seems to just result in a
3821 // lot of code to copy the stack arguments and write them back to the same
3822 // locations, which are supposed to be immutable?
3823 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3824 } else {
3825 // Stores to the argument stack area are relative to the stack pointer.
3826 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3827 MVT::i32);
3828 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3829 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3830 Alignment =
3831 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3832 }
3833
3834 if (Outs[i].Flags.isByVal()) {
3835 SDValue SizeNode =
3836 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3837 SDValue Cpy =
3838 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3839 Outs[i].Flags.getNonZeroByValAlign(),
3840 /*isVol = */ false, /*AlwaysInline = */ true,
3841 /*CI=*/nullptr, std::nullopt, DstInfo,
3843
3844 MemOpChains.push_back(Cpy);
3845 } else {
3846 SDValue Store =
3847 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3848 MemOpChains.push_back(Store);
3849 }
3850 }
3851 }
3852
3853 if (!MemOpChains.empty())
3854 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3855
3856 // Build a sequence of copy-to-reg nodes chained together with token chain
3857 // and flag operands which copy the outgoing args into the appropriate regs.
3858 SDValue InGlue;
3859 for (auto &RegToPass : RegsToPass) {
3860 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3861 RegToPass.second, InGlue);
3862 InGlue = Chain.getValue(1);
3863 }
3864
3865
3866 // We don't usually want to end the call-sequence here because we would tidy
3867 // the frame up *after* the call. However, in the ABI-changing tail-call case
3868 // we've carefully laid out the parameters so that when SP is reset they'll be
3869 // in the correct location.
3870 if (IsTailCall && !IsSibCall) {
3871 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3872 InGlue = Chain.getValue(1);
3873 }
3874
3875 std::vector<SDValue> Ops;
3876 Ops.push_back(Chain);
3877 Ops.push_back(Callee);
3878 // Add a redundant copy of the callee global which will not be legalized, as
3879 // we need direct access to the callee later.
3880 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3881 const GlobalValue *GV = GSD->getGlobal();
3882 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3883 } else {
3884 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3885 }
3886
3887 if (IsTailCall) {
3888 // Each tail call may have to adjust the stack by a different amount, so
3889 // this information must travel along with the operation for eventual
3890 // consumption by emitEpilogue.
3891 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3892 }
3893
3894 if (IsChainCallConv)
3895 Ops.push_back(RequestedExec.Node);
3896
3897 // Add argument registers to the end of the list so that they are known live
3898 // into the call.
3899 for (auto &RegToPass : RegsToPass) {
3900 Ops.push_back(DAG.getRegister(RegToPass.first,
3901 RegToPass.second.getValueType()));
3902 }
3903
3904 // Add a register mask operand representing the call-preserved registers.
3905 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3906 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3907 assert(Mask && "Missing call preserved mask for calling convention");
3908 Ops.push_back(DAG.getRegisterMask(Mask));
3909
3910 if (SDValue Token = CLI.ConvergenceControlToken) {
3912 GlueOps.push_back(Token);
3913 if (InGlue)
3914 GlueOps.push_back(InGlue);
3915
3916 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3917 MVT::Glue, GlueOps),
3918 0);
3919 }
3920
3921 if (InGlue)
3922 Ops.push_back(InGlue);
3923
3924 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3925
3926 // If we're doing a tail call, use a TC_RETURN here rather than an
3927 // actual call instruction.
3928 if (IsTailCall) {
3929 MFI.setHasTailCall();
3930 unsigned OPC = AMDGPUISD::TC_RETURN;
3931 switch (CallConv) {
3934 break;
3938 break;
3939 }
3940
3941 return DAG.getNode(OPC, DL, NodeTys, Ops);
3942 }
3943
3944 // Returns a chain and a flag for retval copy to use.
3945 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3946 Chain = Call.getValue(0);
3947 InGlue = Call.getValue(1);
3948
3949 uint64_t CalleePopBytes = NumBytes;
3950 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3951 if (!Ins.empty())
3952 InGlue = Chain.getValue(1);
3953
3954 // Handle result values, copying them out of physregs into vregs that we
3955 // return.
3956 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3957 InVals, /*IsThisReturn=*/false, SDValue());
3958}
3959
3960// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3961// except for applying the wave size scale to the increment amount.
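// Rough arithmetic sketch (assuming a wave64 subtarget, so
// getWavefrontSizeLog2() == 6): a request for 16 bytes per lane is scaled to
// 16 << 6 = 1024 bytes, since the stack pointer here is a per-wave byte
// offset that has to cover every lane's slice of scratch.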
3963 SDValue Op, SelectionDAG &DAG) const {
3964 const MachineFunction &MF = DAG.getMachineFunction();
3966
3967 SDLoc dl(Op);
3968 EVT VT = Op.getValueType();
3969 SDValue Tmp1 = Op;
3970 SDValue Tmp2 = Op.getValue(1);
3971 SDValue Tmp3 = Op.getOperand(2);
3972 SDValue Chain = Tmp1.getOperand(0);
3973
3974 Register SPReg = Info->getStackPtrOffsetReg();
3975
3976 // Chain the dynamic stack allocation so that it doesn't modify the stack
3977 // pointer when other instructions are using the stack.
3978 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3979
3980 SDValue Size = Tmp2.getOperand(1);
3981 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3982 Chain = SP.getValue(1);
3983 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3984 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3985 unsigned Opc =
3988
3989 SDValue ScaledSize = DAG.getNode(
3990 ISD::SHL, dl, VT, Size,
3991 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3992
3993 Align StackAlign = TFL->getStackAlign();
3994 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3995 if (Alignment && *Alignment > StackAlign) {
3996 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3997 DAG.getConstant(-(uint64_t)Alignment->value()
3998 << Subtarget->getWavefrontSizeLog2(),
3999 dl, VT));
4000 }
4001
4002 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4003 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4004
4005 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4006}
4007
4009 SelectionDAG &DAG) const {
4010 // We only handle constant sizes here to allow non-entry block, static sized
4011 // allocas. A truly dynamic value is more difficult to support because we
4012 // don't know if the size value is uniform or not. If the size isn't uniform,
4013 // we would need to do a wave reduction to get the maximum size to know how
4014 // much to increment the uniform stack pointer.
4015 SDValue Size = Op.getOperand(1);
4016 if (isa<ConstantSDNode>(Size))
4017 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4018
4020}
4021
4023 if (Op.getValueType() != MVT::i32)
4024 return Op; // Defer to cannot select error.
4025
4027 SDLoc SL(Op);
4028
4029 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4030
4031 // Convert from wave uniform to swizzled vector address. This should protect
4032 // from any edge cases where the stacksave result isn't directly used with
4033 // stackrestore.
4034 SDValue VectorAddress =
4035 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4036 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4037}
4038
4040 SelectionDAG &DAG) const {
4041 SDLoc SL(Op);
4042 assert(Op.getValueType() == MVT::i32);
4043
4044 uint32_t BothRoundHwReg =
4046 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4047
4048 SDValue IntrinID =
4049 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4050 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4051 Op.getOperand(0), IntrinID, GetRoundBothImm);
4052
4053 // There are two rounding modes, one for f32 and one for f64/f16. We only
4054 // report in the standard value range if both are the same.
4055 //
4056 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4057 // ties away from zero is not supported, and the other values are rotated by
4058 // 1.
4059 //
4060 // If the two rounding modes are not the same, report a target defined value.
4061
4062 // Mode register rounding mode fields:
4063 //
4064 // [1:0] Single-precision round mode.
4065 // [3:2] Double/Half-precision round mode.
4066 //
4067 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4068 //
4069 // Hardware Spec
4070 // Toward-0 3 0
4071 // Nearest Even 0 1
4072 // +Inf 1 2
4073 // -Inf 2 3
4074 // NearestAway0 N/A 4
4075 //
4076 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4077 // table we can index by the raw hardware mode.
4078 //
4079 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
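//
// Worked example (illustrative, using the Hardware/Spec mapping above): if
// MODE.fp_round == 0x0, both fields are round-to-nearest-even, the shift
// amount is 0 * 4, and the extracted 4-bit entry is 1, which is < 4, so we
// report the standard FLT_ROUNDS value 1 (to nearest). If the two fields
// disagree, the entry is >= 4 and is offset by 4 into the extended range.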
4080
4081 SDValue BitTable =
4083
4084 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4085 SDValue RoundModeTimesNumBits =
4086 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4087
4088 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4089 // knew only one mode was demanded.
4090 SDValue TableValue =
4091 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4092 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4093
4094 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4095 SDValue TableEntry =
4096 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4097
4098 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4099 // if it's an extended value.
4100 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4101 SDValue IsStandardValue =
4102 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4103 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4104 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4105 TableEntry, EnumOffset);
4106
4107 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4108}
4109
4111 SelectionDAG &DAG) const {
4112 SDLoc SL(Op);
4113
4114 SDValue NewMode = Op.getOperand(1);
4115 assert(NewMode.getValueType() == MVT::i32);
4116
4117 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4118 // hardware MODE.fp_round values.
4119 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4120 uint32_t ClampedVal = std::min(
4121 static_cast<uint32_t>(ConstMode->getZExtValue()),
4123 NewMode = DAG.getConstant(
4124 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4125 } else {
4126 // If we know the input can only be one of the supported standard modes in
4127 // the range 0-3, we can use a simplified mapping to hardware values.
4128 KnownBits KB = DAG.computeKnownBits(NewMode);
4129 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4130 // The supported standard values are 0-3. The extended values start at 8. We
4131 // need to offset by 4 if the value is in the extended range.
4132
4133 if (UseReducedTable) {
4134 // Truncate to the low 32-bits.
4135 SDValue BitTable = DAG.getConstant(
4136 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4137
4138 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4139 SDValue RoundModeTimesNumBits =
4140 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4141
4142 NewMode =
4143 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4144
4145 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4146 // the table extracted bits into inline immediates.
4147 } else {
4148 // table_index = umin(value, value - 4)
4149 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
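//
// Worked example (illustrative): for a standard FLT_ROUNDS value such as 2
// (+Inf), value - 4 wraps to a large unsigned number, so umin picks 2 and
// the table index is 2. For an extended value such as 8, umin(8, 8 - 4) = 4,
// so the extended entries land at indices 4 and up, matching the gap of 4
// described above.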
4150 SDValue BitTable =
4152
4153 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4154 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4155 SDValue IndexVal =
4156 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4157
4158 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4159 SDValue RoundModeTimesNumBits =
4160 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4161
4162 SDValue TableValue =
4163 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4164 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4165
4166 // No need to mask out the high bits since the setreg will ignore them
4167 // anyway.
4168 NewMode = TruncTable;
4169 }
4170
4171 // Insert a readfirstlane in case the value is a VGPR. We could do this
4172 // earlier and keep more operations scalar, but that interferes with
4173 // combining the source.
4174 SDValue ReadFirstLaneID =
4175 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4176 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4177 ReadFirstLaneID, NewMode);
4178 }
4179
4180 // N.B. The setreg will be later folded into s_round_mode on supported
4181 // targets.
4182 SDValue IntrinID =
4183 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4184 uint32_t BothRoundHwReg =
4186 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4187
4188 SDValue SetReg =
4189 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4190 IntrinID, RoundBothImm, NewMode);
4191
4192 return SetReg;
4193}
4194
4196 if (Op->isDivergent())
4197 return SDValue();
4198
4199 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4204 break;
4205 default:
4206 return SDValue();
4207 }
4208
4209 return Op;
4210}
4211
4212// Work around DAG legality rules only based on the result type.
4214 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4215 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4216 EVT SrcVT = Src.getValueType();
4217
4218 if (SrcVT.getScalarType() != MVT::bf16)
4219 return Op;
4220
4221 SDLoc SL(Op);
4222 SDValue BitCast =
4223 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4224
4225 EVT DstVT = Op.getValueType();
4226 if (IsStrict)
4227 llvm_unreachable("Need STRICT_BF16_TO_FP");
4228
4229 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4230}
4231
4233 SDLoc SL(Op);
4234 if (Op.getValueType() != MVT::i64)
4235 return Op;
4236
4237 uint32_t ModeHwReg =
4239 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4240 uint32_t TrapHwReg =
4242 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4243
4244 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4245 SDValue IntrinID =
4246 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4247 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4248 Op.getOperand(0), IntrinID, ModeHwRegImm);
4249 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4250 Op.getOperand(0), IntrinID, TrapHwRegImm);
4251 SDValue TokenReg =
4252 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4253 GetTrapReg.getValue(1));
4254
4255 SDValue CvtPtr =
4256 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4257 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4258
4259 return DAG.getMergeValues({Result, TokenReg}, SL);
4260}
4261
4263 SDLoc SL(Op);
4264 if (Op.getOperand(1).getValueType() != MVT::i64)
4265 return Op;
4266
4267 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4268 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4269 DAG.getConstant(0, SL, MVT::i32));
4270 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4271 DAG.getConstant(1, SL, MVT::i32));
4272
4273 SDValue ReadFirstLaneID =
4274 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4275 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4276 ReadFirstLaneID, NewModeReg);
4277 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4278 ReadFirstLaneID, NewTrapReg);
4279
4280 unsigned ModeHwReg =
4282 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4283 unsigned TrapHwReg =
4285 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4286
4287 SDValue IntrinID =
4288 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4289 SDValue SetModeReg =
4290 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4291 IntrinID, ModeHwRegImm, NewModeReg);
4292 SDValue SetTrapReg =
4293 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4294 IntrinID, TrapHwRegImm, NewTrapReg);
4295 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4296}
4297
4299 const MachineFunction &MF) const {
4301 .Case("m0", AMDGPU::M0)
4302 .Case("exec", AMDGPU::EXEC)
4303 .Case("exec_lo", AMDGPU::EXEC_LO)
4304 .Case("exec_hi", AMDGPU::EXEC_HI)
4305 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4306 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4307 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4308 .Default(Register());
4309
4310 if (Reg == AMDGPU::NoRegister) {
4311 report_fatal_error(Twine("invalid register name \""
4312 + StringRef(RegName) + "\"."));
4313
4314 }
4315
4316 if (!Subtarget->hasFlatScrRegister() &&
4317 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4318 report_fatal_error(Twine("invalid register \""
4319 + StringRef(RegName) + "\" for subtarget."));
4320 }
4321
4322 switch (Reg) {
4323 case AMDGPU::M0:
4324 case AMDGPU::EXEC_LO:
4325 case AMDGPU::EXEC_HI:
4326 case AMDGPU::FLAT_SCR_LO:
4327 case AMDGPU::FLAT_SCR_HI:
4328 if (VT.getSizeInBits() == 32)
4329 return Reg;
4330 break;
4331 case AMDGPU::EXEC:
4332 case AMDGPU::FLAT_SCR:
4333 if (VT.getSizeInBits() == 64)
4334 return Reg;
4335 break;
4336 default:
4337 llvm_unreachable("missing register type checking");
4338 }
4339
4340 report_fatal_error(Twine("invalid type for register \""
4341 + StringRef(RegName) + "\"."));
4342}
4343
4344// If kill is not the last instruction, split the block so kill is always a
4345// proper terminator.
4348 MachineBasicBlock *BB) const {
4349 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4351 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4352 return SplitBB;
4353}
4354
4355// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4356// \p MI will be the only instruction in the loop body block. Otherwise, it will
4357// be the first instruction in the remainder block.
4358//
4359/// \returns { LoopBody, Remainder }
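//
// Rough CFG sketch of the result (illustrative; the caller is expected to add
// the conditional back-edge that actually terminates the loop):
//
//   MBB ---> LoopBB ---> RemainderBB
//              ^  |
//              +--+   (LoopBB is created with itself as a successor)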
4360static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4364
4365 // To insert the loop we need to split the block. Move everything after this
4366 // point to a new block, and insert a new empty block between the two.
4368 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4370 ++MBBI;
4371
4372 MF->insert(MBBI, LoopBB);
4373 MF->insert(MBBI, RemainderBB);
4374
4375 LoopBB->addSuccessor(LoopBB);
4376 LoopBB->addSuccessor(RemainderBB);
4377
4378 // Move the rest of the block into a new block.
4379 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4380
4381 if (InstInLoop) {
4382 auto Next = std::next(I);
4383
4384 // Move instruction to loop body.
4385 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4386
4387 // Move the rest of the block.
4388 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4389 } else {
4390 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4391 }
4392
4393 MBB.addSuccessor(LoopBB);
4394
4395 return std::pair(LoopBB, RemainderBB);
4396}
4397
4398/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4400 MachineBasicBlock *MBB = MI.getParent();
4402 auto I = MI.getIterator();
4403 auto E = std::next(I);
4404
4405 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4406 .addImm(0);
4407
4408 MIBundleBuilder Bundler(*MBB, I, E);
4409 finalizeBundle(*MBB, Bundler.begin());
4410}
4411
4414 MachineBasicBlock *BB) const {
4415 const DebugLoc &DL = MI.getDebugLoc();
4416
4418
4419 MachineBasicBlock *LoopBB;
4420 MachineBasicBlock *RemainderBB;
4422
4423 // Apparently kill flags are only valid if the def is in the same block?
4424 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4425 Src->setIsKill(false);
4426
4427 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4428
4429 MachineBasicBlock::iterator I = LoopBB->end();
4430
4431 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4433
4434 // Clear TRAP_STS.MEM_VIOL
4435 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4436 .addImm(0)
4437 .addImm(EncodedReg);
4438
4440
4441 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4442
4443 // Load and check TRAP_STS.MEM_VIOL
4444 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4445 .addImm(EncodedReg);
4446
4447 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4448 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4449 .addReg(Reg, RegState::Kill)
4450 .addImm(0);
4451 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4452 .addMBB(LoopBB);
4453
4454 return RemainderBB;
4455}
4456
4457// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4458// wavefront. If the value is uniform and just happens to be in a VGPR, this
4459// will only do one iteration. In the worst case, this will loop 64 times.
4460//
4461// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
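//
// A rough sketch of the emitted waterfall loop (illustrative only):
//   loop:
//     read one lane's index with v_readfirstlane_b32;
//     v_cmp_eq_u32 selects every lane holding that same index;
//     s_and_saveexec restricts EXEC to those lanes and the indexed move runs;
//     the handled lanes are then removed from the to-do mask and
//     s_cbranch_execnz loops while any lane remains.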
4464 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4465 const DebugLoc &DL, const MachineOperand &Idx,
4466 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4467 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4468 Register &SGPRIdxReg) {
4469
4470 MachineFunction *MF = OrigBB.getParent();
4471 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4472 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4474
4475 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4476 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4477 Register NewExec = MRI.createVirtualRegister(BoolRC);
4478 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4479 Register CondReg = MRI.createVirtualRegister(BoolRC);
4480
4481 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4482 .addReg(InitReg)
4483 .addMBB(&OrigBB)
4484 .addReg(ResultReg)
4485 .addMBB(&LoopBB);
4486
4487 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4488 .addReg(InitSaveExecReg)
4489 .addMBB(&OrigBB)
4490 .addReg(NewExec)
4491 .addMBB(&LoopBB);
4492
4493 // Read the next variant <- also loop target.
4494 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4495 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4496
4497 // Compare the just read M0 value to all possible Idx values.
4498 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4499 .addReg(CurrentIdxReg)
4500 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4501
4502 // Update EXEC, save the original EXEC value to VCC.
4503 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4504 : AMDGPU::S_AND_SAVEEXEC_B64),
4505 NewExec)
4506 .addReg(CondReg, RegState::Kill);
4507
4508 MRI.setSimpleHint(NewExec, CondReg);
4509
4510 if (UseGPRIdxMode) {
4511 if (Offset == 0) {
4512 SGPRIdxReg = CurrentIdxReg;
4513 } else {
4514 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4515 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4516 .addReg(CurrentIdxReg, RegState::Kill)
4517 .addImm(Offset);
4518 }
4519 } else {
4520 // Move index from VCC into M0
4521 if (Offset == 0) {
4522 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4523 .addReg(CurrentIdxReg, RegState::Kill);
4524 } else {
4525 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4526 .addReg(CurrentIdxReg, RegState::Kill)
4527 .addImm(Offset);
4528 }
4529 }
4530
4531 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4532 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4533 MachineInstr *InsertPt =
4534 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4535 : AMDGPU::S_XOR_B64_term), Exec)
4536 .addReg(Exec)
4537 .addReg(NewExec);
4538
4539 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4540 // s_cbranch_scc0?
4541
4542 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4543 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4544 .addMBB(&LoopBB);
4545
4546 return InsertPt->getIterator();
4547}
4548
4549 // This has slightly sub-optimal register allocation when the source vector is
4550 // killed by the read. The register allocator does not understand that the kill
4551 // is per-workitem, so the vector is kept alive for the whole loop and we end up
4552 // not reusing a subregister from it, using one more VGPR than necessary. This
4553 // extra VGPR was avoided back when this was expanded after register allocation.
4556 unsigned InitResultReg, unsigned PhiReg, int Offset,
4557 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4559 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4560 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4562 const DebugLoc &DL = MI.getDebugLoc();
4564
4565 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4566 Register DstReg = MI.getOperand(0).getReg();
4567 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4568 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4569 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4570 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4571
4572 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4573
4574 // Save the EXEC mask
4575 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4576 .addReg(Exec);
4577
4578 MachineBasicBlock *LoopBB;
4579 MachineBasicBlock *RemainderBB;
4580 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4581
4582 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4583
4584 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4585 InitResultReg, DstReg, PhiReg, TmpExec,
4586 Offset, UseGPRIdxMode, SGPRIdxReg);
4587
4588 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4590 ++MBBI;
4591 MF->insert(MBBI, LandingPad);
4592 LoopBB->removeSuccessor(RemainderBB);
4593 LandingPad->addSuccessor(RemainderBB);
4594 LoopBB->addSuccessor(LandingPad);
4595 MachineBasicBlock::iterator First = LandingPad->begin();
4596 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4597 .addReg(SaveExec);
4598
4599 return InsPt;
4600}
4601
4602// Returns subreg index, offset
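// Worked example (illustrative): for a 256-bit (8 x 32-bit) source register
// class, a constant offset of 3 yields {sub3, 0}, i.e. the access folds into
// a static subregister index. An out-of-bounds constant offset such as 9 is
// returned unchanged as {sub0, 9} and remains part of the dynamic index.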
4603static std::pair<unsigned, int>
4605 const TargetRegisterClass *SuperRC,
4606 unsigned VecReg,
4607 int Offset) {
4608 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4609
4610 // Skip out of bounds offsets, or else we would end up using an undefined
4611 // register.
4612 if (Offset >= NumElts || Offset < 0)
4613 return std::pair(AMDGPU::sub0, Offset);
4614
4615 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4616}
4617
4620 int Offset) {
4621 MachineBasicBlock *MBB = MI.getParent();
4622 const DebugLoc &DL = MI.getDebugLoc();
4624
4625 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4626
4627 assert(Idx->getReg() != AMDGPU::NoRegister);
4628
4629 if (Offset == 0) {
4630 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4631 } else {
4632 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4633 .add(*Idx)
4634 .addImm(Offset);
4635 }
4636}
4637
4640 int Offset) {
4641 MachineBasicBlock *MBB = MI.getParent();
4642 const DebugLoc &DL = MI.getDebugLoc();
4644
4645 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4646
4647 if (Offset == 0)
4648 return Idx->getReg();
4649
4650 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4651 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4652 .add(*Idx)
4653 .addImm(Offset);
4654 return Tmp;
4655}
4656
4659 const GCNSubtarget &ST) {
4660 const SIInstrInfo *TII = ST.getInstrInfo();
4661 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4664
4665 Register Dst = MI.getOperand(0).getReg();
4666 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4667 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4668 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4669
4670 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4671 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4672
4673 unsigned SubReg;
4674 std::tie(SubReg, Offset)
4675 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4676
4677 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4678
4679 // Check for a SGPR index.
4680 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4682 const DebugLoc &DL = MI.getDebugLoc();
4683
4684 if (UseGPRIdxMode) {
4685 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4686 // to avoid interfering with other uses, so probably requires a new
4687 // optimization pass.
4689
4690 const MCInstrDesc &GPRIDXDesc =
4691 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4692 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4693 .addReg(SrcReg)
4694 .addReg(Idx)
4695 .addImm(SubReg);
4696 } else {
4698
4699 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4700 .addReg(SrcReg, 0, SubReg)
4701 .addReg(SrcReg, RegState::Implicit);
4702 }
4703
4704 MI.eraseFromParent();
4705
4706 return &MBB;
4707 }
4708
4709 // Control flow needs to be inserted if indexing with a VGPR.
4710 const DebugLoc &DL = MI.getDebugLoc();
4712
4713 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4714 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4715
4716 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4717
4718 Register SGPRIdxReg;
4719 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4720 UseGPRIdxMode, SGPRIdxReg);
4721
4722 MachineBasicBlock *LoopBB = InsPt->getParent();
4723
4724 if (UseGPRIdxMode) {
4725 const MCInstrDesc &GPRIDXDesc =
4726 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4727
4728 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4729 .addReg(SrcReg)
4730 .addReg(SGPRIdxReg)
4731 .addImm(SubReg);
4732 } else {
4733 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4734 .addReg(SrcReg, 0, SubReg)
4735 .addReg(SrcReg, RegState::Implicit);
4736 }
4737
4738 MI.eraseFromParent();
4739
4740 return LoopBB;
4741}
4742
4745 const GCNSubtarget &ST) {
4746 const SIInstrInfo *TII = ST.getInstrInfo();
4747 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4750
4751 Register Dst = MI.getOperand(0).getReg();
4752 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4753 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4754 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4755 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4756 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4757 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4758
4759 // This can be an immediate, but will be folded later.
4760 assert(Val->getReg());
4761
4762 unsigned SubReg;
4763 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4764 SrcVec->getReg(),
4765 Offset);
4766 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4767
4768 if (Idx->getReg() == AMDGPU::NoRegister) {
4770 const DebugLoc &DL = MI.getDebugLoc();
4771
4772 assert(Offset == 0);
4773
4774 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4775 .add(*SrcVec)
4776 .add(*Val)
4777 .addImm(SubReg);
4778
4779 MI.eraseFromParent();
4780 return &MBB;
4781 }
4782
4783 // Check for a SGPR index.
4784 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4786 const DebugLoc &DL = MI.getDebugLoc();
4787
4788 if (UseGPRIdxMode) {
4790
4791 const MCInstrDesc &GPRIDXDesc =
4792 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4793 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4794 .addReg(SrcVec->getReg())
4795 .add(*Val)
4796 .addReg(Idx)
4797 .addImm(SubReg);
4798 } else {
4800
4801 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4802 TRI.getRegSizeInBits(*VecRC), 32, false);
4803 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4804 .addReg(SrcVec->getReg())
4805 .add(*Val)
4806 .addImm(SubReg);
4807 }
4808 MI.eraseFromParent();
4809 return &MBB;
4810 }
4811
4812 // Control flow needs to be inserted if indexing with a VGPR.
4813 if (Val->isReg())
4814 MRI.clearKillFlags(Val->getReg());
4815
4816 const DebugLoc &DL = MI.getDebugLoc();
4817
4818 Register PhiReg = MRI.createVirtualRegister(VecRC);
4819
4820 Register SGPRIdxReg;
4821 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4822 UseGPRIdxMode, SGPRIdxReg);
4823 MachineBasicBlock *LoopBB = InsPt->getParent();
4824
4825 if (UseGPRIdxMode) {
4826 const MCInstrDesc &GPRIDXDesc =
4827 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4828
4829 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4830 .addReg(PhiReg)
4831 .add(*Val)
4832 .addReg(SGPRIdxReg)
4833 .addImm(SubReg);
4834 } else {
4835 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4836 TRI.getRegSizeInBits(*VecRC), 32, false);
4837 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4838 .addReg(PhiReg)
4839 .add(*Val)
4840 .addImm(SubReg);
4841 }
4842
4843 MI.eraseFromParent();
4844 return LoopBB;
4845}
4846
4849 const GCNSubtarget &ST,
4850 unsigned Opc) {
4852 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4853 const DebugLoc &DL = MI.getDebugLoc();
4854 const SIInstrInfo *TII = ST.getInstrInfo();
4855
4856 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4857 Register SrcReg = MI.getOperand(1).getReg();
4858 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4859 Register DstReg = MI.getOperand(0).getReg();
4860 MachineBasicBlock *RetBB = nullptr;
4861 if (isSGPR) {
4862 // These operations are idempotent on a uniform value, i.e. an SGPR input.
4863 // The reduced value will be the same as the given SGPR.
4864 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4865 RetBB = &BB;
4866 } else {
4867 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4868 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4869 // iterative approach by default.
4870
4871 // To reduce the VGPR using the iterative approach, we need to iterate over
4872 // all the active lanes. The lowering consists of a ComputeLoop, which
4873 // iterates over only the active lanes. We use a copy of the EXEC register as
4874 // the induction variable, and every active lane clears its bit with bitset0
4875 // so that we get the next active lane for the next iteration.
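//
// A rough sketch of ComputeLoop (wave32 flavour shown for illustration):
//   accum = identity;  live = copy of EXEC_LO;
//   do {
//     lane  = s_ff1_i32_b32(live);        // lowest remaining active lane
//     val   = v_readlane_b32(src, lane);  // fetch that lane's value
//     accum = op(accum, val);             // e.g. s_min_u32 / s_max_u32
//     live  = s_bitset0_b32(live, lane);  // retire the lane
//   } while (live != 0);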
4877 Register SrcReg = MI.getOperand(1).getReg();
4878
4879 // Create control flow for the loop:
4880 // split MI's machine basic block into the loop body and a remainder block.
4881 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4882
4883 // Create virtual registers required for lowering.
4884 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4885 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4886 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4887 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4888
4889 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4890 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4891 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4892
4893 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4894 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4895
4896 bool IsWave32 = ST.isWave32();
4897 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4898 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4899
4900 // Create initial values of the induction variable from EXEC and the
4901 // accumulator, and insert a branch to the newly created ComputeLoop block.
4902 uint32_t InitalValue =
4903 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4904 auto TmpSReg =
4905 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4906 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4907 .addImm(InitalValue);
4908 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4909
4910 // Start constructing ComputeLoop
4911 I = ComputeLoop->end();
4912 auto Accumulator =
4913 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4914 .addReg(InitalValReg)
4915 .addMBB(&BB);
4916 auto ActiveBits =
4917 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4918 .addReg(TmpSReg->getOperand(0).getReg())
4919 .addMBB(&BB);
4920
4921 // Perform the computations
4922 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4923 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4924 .addReg(ActiveBits->getOperand(0).getReg());
4925 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4926 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4927 .addReg(SrcReg)
4928 .addReg(FF1->getOperand(0).getReg());
4929 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4930 .addReg(Accumulator->getOperand(0).getReg())
4931 .addReg(LaneValue->getOperand(0).getReg());
4932
4933 // Manipulate the iterator to get the next active lane
4934 unsigned BITSETOpc =
4935 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4936 auto NewActiveBits =
4937 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4938 .addReg(FF1->getOperand(0).getReg())
4939 .addReg(ActiveBits->getOperand(0).getReg());
4940
4941 // Add phi nodes
4942 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4943 .addMBB(ComputeLoop);
4944 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4945 .addMBB(ComputeLoop);
4946
4947 // Create the loop-exit comparison and conditional branch.
4948 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4949 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4950 .addReg(NewActiveBits->getOperand(0).getReg())
4951 .addImm(0);
4952 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4953 .addMBB(ComputeLoop);
4954
4955 RetBB = ComputeEnd;
4956 }
4957 MI.eraseFromParent();
4958 return RetBB;
4959}
4960
4962 MachineInstr &MI, MachineBasicBlock *BB) const {
4963
4965 MachineFunction *MF = BB->getParent();
4967
4968 switch (MI.getOpcode()) {
4969 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4970 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4971 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4972 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4973 case AMDGPU::S_UADDO_PSEUDO:
4974 case AMDGPU::S_USUBO_PSEUDO: {
4975 const DebugLoc &DL = MI.getDebugLoc();
4976 MachineOperand &Dest0 = MI.getOperand(0);
4977 MachineOperand &Dest1 = MI.getOperand(1);
4978 MachineOperand &Src0 = MI.getOperand(2);
4979 MachineOperand &Src1 = MI.getOperand(3);
4980
4981 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4982 ? AMDGPU::S_ADD_I32
4983 : AMDGPU::S_SUB_I32;
4984 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4985
4986 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4987 .addImm(1)
4988 .addImm(0);
4989
4990 MI.eraseFromParent();
4991 return BB;
4992 }
4993 case AMDGPU::S_ADD_U64_PSEUDO:
4994 case AMDGPU::S_SUB_U64_PSEUDO: {
4995 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4996 // For GFX12, we emit s_add_u64 and s_sub_u64.
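// Sketch of the pre-GFX12 expansion for the add case (illustrative):
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0   // sets SCC to the carry-out
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1   // consumes SCC as carry-in
// The two halves are then recombined with a REG_SEQUENCE.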
4997 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4999 const DebugLoc &DL = MI.getDebugLoc();
5000 MachineOperand &Dest = MI.getOperand(0);
5001 MachineOperand &Src0 = MI.getOperand(1);
5002 MachineOperand &Src1 = MI.getOperand(2);
5003 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5004 if (Subtarget->hasScalarAddSub64()) {
5005 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5006 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5007 .add(Src0)
5008 .add(Src1);
5009 } else {
5010 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5011 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5012
5013 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5014 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5015
5016 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5017 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5018 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5019 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5020
5021 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5022 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5023 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5024 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5025
5026 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5027 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5028 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5029 .add(Src0Sub0)
5030 .add(Src1Sub0);
5031 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5032 .add(Src0Sub1)
5033 .add(Src1Sub1);
5034 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5035 .addReg(DestSub0)
5036 .addImm(AMDGPU::sub0)
5037 .addReg(DestSub1)
5038 .addImm(AMDGPU::sub1);
5039 }
5040 MI.eraseFromParent();
5041 return BB;
5042 }
5043 case AMDGPU::V_ADD_U64_PSEUDO:
5044 case AMDGPU::V_SUB_U64_PSEUDO: {
5046 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5047 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5048 const DebugLoc &DL = MI.getDebugLoc();
5049
5050 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5051
5052 MachineOperand &Dest = MI.getOperand(0);
5053 MachineOperand &Src0 = MI.getOperand(1);
5054 MachineOperand &Src1 = MI.getOperand(2);
5055
5056 if (IsAdd && ST.hasLshlAddB64()) {
5057 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5058 Dest.getReg())
5059 .add(Src0)
5060 .addImm(0)
5061 .add(Src1);
5062 TII->legalizeOperands(*Add);
5063 MI.eraseFromParent();
5064 return BB;
5065 }
5066
5067 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5068
5069 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5070 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5071
5072 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5073 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5074
5075 const TargetRegisterClass *Src0RC = Src0.isReg()
5076 ? MRI.getRegClass(Src0.getReg())
5077 : &AMDGPU::VReg_64RegClass;
5078 const TargetRegisterClass *Src1RC = Src1.isReg()
5079 ? MRI.getRegClass(Src1.getReg())
5080 : &AMDGPU::VReg_64RegClass;
5081
5082 const TargetRegisterClass *Src0SubRC =
5083 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5084 const TargetRegisterClass *Src1SubRC =
5085 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5086
5087 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5088 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5089 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5090 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5091
5092 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5093 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5094 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5095 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5096
5097 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5098 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5099 .addReg(CarryReg, RegState::Define)
5100 .add(SrcReg0Sub0)
5101 .add(SrcReg1Sub0)
5102 .addImm(0); // clamp bit
5103
5104 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5105 MachineInstr *HiHalf =
5106 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5107 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5108 .add(SrcReg0Sub1)
5109 .add(SrcReg1Sub1)
5110 .addReg(CarryReg, RegState::Kill)
5111 .addImm(0); // clamp bit
5112
5113 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5114 .addReg(DestSub0)
5115 .addImm(AMDGPU::sub0)
5116 .addReg(DestSub1)
5117 .addImm(AMDGPU::sub1);
5118 TII->legalizeOperands(*LoHalf);
5119 TII->legalizeOperands(*HiHalf);
5120 MI.eraseFromParent();
5121 return BB;
5122 }
5123 case AMDGPU::S_ADD_CO_PSEUDO:
5124 case AMDGPU::S_SUB_CO_PSEUDO: {
5125 // This pseudo has a chance to be selected
5126 // only from a uniform add/subcarry node. All the VGPR operands
5127 // are therefore assumed to be splat vectors.
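//
// Illustrative flow: any VGPR operand (including the carry-in) is first moved
// to an SGPR with v_readfirstlane_b32, the carry-in mask is compared against
// zero to materialize SCC, s_addc_u32 / s_subb_u32 performs the arithmetic,
// and an s_cselect produces the all-ones/zero carry-out mask.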
5129 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5130 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5132 const DebugLoc &DL = MI.getDebugLoc();
5133 MachineOperand &Dest = MI.getOperand(0);
5134 MachineOperand &CarryDest = MI.getOperand(1);
5135 MachineOperand &Src0 = MI.getOperand(2);
5136 MachineOperand &Src1 = MI.getOperand(3);
5137 MachineOperand &Src2 = MI.getOperand(4);
5138 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5139 ? AMDGPU::S_ADDC_U32
5140 : AMDGPU::S_SUBB_U32;
5141 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5142 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5143 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5144 .addReg(Src0.getReg());
5145 Src0.setReg(RegOp0);
5146 }
5147 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5148 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5149 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5150 .addReg(Src1.getReg());
5151 Src1.setReg(RegOp1);
5152 }
5153 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5154 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5155 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5156 .addReg(Src2.getReg());
5157 Src2.setReg(RegOp2);
5158 }
5159
5160 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5161 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5162 assert(WaveSize == 64 || WaveSize == 32);
5163
5164 if (WaveSize == 64) {
5165 if (ST.hasScalarCompareEq64()) {
5166 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5167 .addReg(Src2.getReg())
5168 .addImm(0);
5169 } else {
5170 const TargetRegisterClass *SubRC =
5171 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5172 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5173 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5174 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5175 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5176 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5177
5178 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5179 .add(Src2Sub0)
5180 .add(Src2Sub1);
5181
5182 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5183 .addReg(Src2_32, RegState::Kill)
5184 .addImm(0);
5185 }
5186 } else {
5187 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5188 .addReg(Src2.getReg())
5189 .addImm(0);
5190 }
5191
5192 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5193
5194 unsigned SelOpc =
5195 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5196
5197 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5198 .addImm(-1)
5199 .addImm(0);
5200
5201 MI.eraseFromParent();
5202 return BB;
5203 }
5204 case AMDGPU::SI_INIT_M0: {
5205 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5206 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5207 .add(MI.getOperand(0));
5208 MI.eraseFromParent();
5209 return BB;
5210 }
5211 case AMDGPU::GET_GROUPSTATICSIZE: {
5212 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5213 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5214 DebugLoc DL = MI.getDebugLoc();
5215 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5216 .add(MI.getOperand(0))
5217 .addImm(MFI->getLDSSize());
5218 MI.eraseFromParent();
5219 return BB;
5220 }
5221 case AMDGPU::GET_SHADERCYCLESHILO: {
5224 const DebugLoc &DL = MI.getDebugLoc();
5225 // The algorithm is:
5226 //
5227 // hi1 = getreg(SHADER_CYCLES_HI)
5228 // lo1 = getreg(SHADER_CYCLES_LO)
5229 // hi2 = getreg(SHADER_CYCLES_HI)
5230 //
5231 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5232 // Otherwise there was overflow and the result is hi2:0. In both cases the
5233 // result should represent the actual time at some point during the sequence
5234 // of three getregs.
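//
// Worked example (illustrative): suppose the 64-bit counter rolls over from
// 0x1_FFFFFFFF to 0x2_00000000 between the first and third reads. We might
// observe hi1 = 1, lo1 = 0xFFFFFFF8, hi2 = 2; since hi1 != hi2 we return
// hi2:0 = 0x2_00000000, a value the counter really held inside the window.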
5235 using namespace AMDGPU::Hwreg;
5236 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5237 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5238 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5239 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5241 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5242 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5243 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5244 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5245 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5246 .addReg(RegHi1)
5247 .addReg(RegHi2);
5248 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5249 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5250 .addReg(RegLo1)
5251 .addImm(0);
5252 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5253 .add(MI.getOperand(0))
5254 .addReg(RegLo)
5255 .addImm(AMDGPU::sub0)
5256 .addReg(RegHi2)
5257 .addImm(AMDGPU::sub1);
5258 MI.eraseFromParent();
5259 return BB;
5260 }
5261 case AMDGPU::SI_INDIRECT_SRC_V1:
5262 case AMDGPU::SI_INDIRECT_SRC_V2:
5263 case AMDGPU::SI_INDIRECT_SRC_V4:
5264 case AMDGPU::SI_INDIRECT_SRC_V8:
5265 case AMDGPU::SI_INDIRECT_SRC_V9:
5266 case AMDGPU::SI_INDIRECT_SRC_V10:
5267 case AMDGPU::SI_INDIRECT_SRC_V11:
5268 case AMDGPU::SI_INDIRECT_SRC_V12:
5269 case AMDGPU::SI_INDIRECT_SRC_V16:
5270 case AMDGPU::SI_INDIRECT_SRC_V32:
5271 return emitIndirectSrc(MI, *BB, *getSubtarget());
5272 case AMDGPU::SI_INDIRECT_DST_V1:
5273 case AMDGPU::SI_INDIRECT_DST_V2:
5274 case AMDGPU::SI_INDIRECT_DST_V4:
5275 case AMDGPU::SI_INDIRECT_DST_V8:
5276 case AMDGPU::SI_INDIRECT_DST_V9:
5277 case AMDGPU::SI_INDIRECT_DST_V10:
5278 case AMDGPU::SI_INDIRECT_DST_V11:
5279 case AMDGPU::SI_INDIRECT_DST_V12:
5280 case AMDGPU::SI_INDIRECT_DST_V16:
5281 case AMDGPU::SI_INDIRECT_DST_V32:
5282 return emitIndirectDst(MI, *BB, *getSubtarget());
5283 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5284 case AMDGPU::SI_KILL_I1_PSEUDO:
5285 return splitKillBlock(MI, BB);
5286 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5288 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5289 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5290
5291 Register Dst = MI.getOperand(0).getReg();
5292 const MachineOperand &Src0 = MI.getOperand(1);
5293 const MachineOperand &Src1 = MI.getOperand(2);
5294 const DebugLoc &DL = MI.getDebugLoc();
5295 Register SrcCond = MI.getOperand(3).getReg();
5296
5297 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5298 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5299 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5300 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5301
5302 const TargetRegisterClass *Src0RC = Src0.isReg()
5303 ? MRI.getRegClass(Src0.getReg())
5304 : &AMDGPU::VReg_64RegClass;
5305 const TargetRegisterClass *Src1RC = Src1.isReg()
5306 ? MRI.getRegClass(Src1.getReg())
5307 : &AMDGPU::VReg_64RegClass;
5308
5309 const TargetRegisterClass *Src0SubRC =
5310 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5311 const TargetRegisterClass *Src1SubRC =
5312 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5313
5314 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5315 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5316 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5317 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5318
5319 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5320 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5321 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5322 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5323
5324 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5325 .addReg(SrcCond);
5326 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5327 .addImm(0)
5328 .add(Src0Sub0)
5329 .addImm(0)
5330 .add(Src1Sub0)
5331 .addReg(SrcCondCopy);
5332 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5333 .addImm(0)
5334 .add(Src0Sub1)
5335 .addImm(0)
5336 .add(Src1Sub1)
5337 .addReg(SrcCondCopy);
5338
5339 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5340 .addReg(DstLo)
5341 .addImm(AMDGPU::sub0)
5342 .addReg(DstHi)
5343 .addImm(AMDGPU::sub1);
5344 MI.eraseFromParent();
5345 return BB;
5346 }
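// A minimal standalone sketch of the splitting performed above, assuming
// plain <cstdint> integers in place of 64-bit VGPR pairs: the 64-bit select
// is done as two independent 32-bit selects on the sub0/sub1 halves and the
// results are recombined, mirroring the two V_CNDMASK_B32 plus REG_SEQUENCE.
static uint64_t select64ViaHalvesSketch(bool Cond, uint64_t TrueVal,
                                        uint64_t FalseVal) {
  uint32_t Lo = Cond ? uint32_t(TrueVal) : uint32_t(FalseVal);
  uint32_t Hi = Cond ? uint32_t(TrueVal >> 32) : uint32_t(FalseVal >> 32);
  return (uint64_t(Hi) << 32) | Lo;
}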
5347 case AMDGPU::SI_BR_UNDEF: {
5349 const DebugLoc &DL = MI.getDebugLoc();
5350 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5351 .add(MI.getOperand(0));
5352 Br->getOperand(1).setIsUndef(); // read undef SCC
5353 MI.eraseFromParent();
5354 return BB;
5355 }
5356 case AMDGPU::ADJCALLSTACKUP:
5357 case AMDGPU::ADJCALLSTACKDOWN: {
5359 MachineInstrBuilder MIB(*MF, &MI);
5360 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5361 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5362 return BB;
5363 }
5364 case AMDGPU::SI_CALL_ISEL: {
5366 const DebugLoc &DL = MI.getDebugLoc();
5367
5368 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5369
5371 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5372
5373 for (const MachineOperand &MO : MI.operands())
5374 MIB.add(MO);
5375
5376 MIB.cloneMemRefs(MI);
5377 MI.eraseFromParent();
5378 return BB;
5379 }
5380 case AMDGPU::V_ADD_CO_U32_e32:
5381 case AMDGPU::V_SUB_CO_U32_e32:
5382 case AMDGPU::V_SUBREV_CO_U32_e32: {
5383 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5384 const DebugLoc &DL = MI.getDebugLoc();
5385 unsigned Opc = MI.getOpcode();
5386
5387 bool NeedClampOperand = false;
5388 if (TII->pseudoToMCOpcode(Opc) == -1) {
5389 Opc = AMDGPU::getVOPe64(Opc);
5390 NeedClampOperand = true;
5391 }
5392
5393 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5394 if (TII->isVOP3(*I)) {
5395 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5396 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5397 I.addReg(TRI->getVCC(), RegState::Define);
5398 }
5399 I.add(MI.getOperand(1))
5400 .add(MI.getOperand(2));
5401 if (NeedClampOperand)
5402 I.addImm(0); // clamp bit for e64 encoding
5403
5404 TII->legalizeOperands(*I);
5405
5406 MI.eraseFromParent();
5407 return BB;
5408 }
5409 case AMDGPU::V_ADDC_U32_e32:
5410 case AMDGPU::V_SUBB_U32_e32:
5411 case AMDGPU::V_SUBBREV_U32_e32:
5412 // These instructions have an implicit use of vcc which counts towards the
5413 // constant bus limit.
5414 TII->legalizeOperands(MI);
5415 return BB;
5416 case AMDGPU::DS_GWS_INIT:
5417 case AMDGPU::DS_GWS_SEMA_BR:
5418 case AMDGPU::DS_GWS_BARRIER:
5419 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5420 [[fallthrough]];
5421 case AMDGPU::DS_GWS_SEMA_V:
5422 case AMDGPU::DS_GWS_SEMA_P:
5423 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5424 // A s_waitcnt 0 is required to be the instruction immediately following.
5425 if (getSubtarget()->hasGWSAutoReplay()) {
5427 return BB;
5428 }
5429
5430 return emitGWSMemViolTestLoop(MI, BB);
5431 case AMDGPU::S_SETREG_B32: {
5432 // Try to optimize cases that only set the denormal mode or rounding mode.
5433 //
5434 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5435 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5436 // instead.
5437 //
5438 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5439 // allow a no-side-effect instruction in the output of a
5440 // side-effecting pattern.
5441 auto [ID, Offset, Width] =
5442 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5443 if (ID != AMDGPU::Hwreg::ID_MODE)
5444 return BB;
5445
5446 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5447 const unsigned SetMask = WidthMask << Offset;
5448
5449 if (getSubtarget()->hasDenormModeInst()) {
5450 unsigned SetDenormOp = 0;
5451 unsigned SetRoundOp = 0;
5452
5453 // The dedicated instructions can only set the whole denorm or round mode
5454 // at once, not a subset of bits in either.
5455 if (SetMask ==
5456 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5457 // If this fully sets both the round and denorm mode, emit the two
5458 // dedicated instructions for these.
5459 SetRoundOp = AMDGPU::S_ROUND_MODE;
5460 SetDenormOp = AMDGPU::S_DENORM_MODE;
5461 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5462 SetRoundOp = AMDGPU::S_ROUND_MODE;
5463 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5464 SetDenormOp = AMDGPU::S_DENORM_MODE;
5465 }
5466
5467 if (SetRoundOp || SetDenormOp) {
5469 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5470 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5471 unsigned ImmVal = Def->getOperand(1).getImm();
5472 if (SetRoundOp) {
5473 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5474 .addImm(ImmVal & 0xf);
5475
5476 // If we also have the denorm mode, get just the denorm mode bits.
5477 ImmVal >>= 4;
5478 }
5479
5480 if (SetDenormOp) {
5481 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5482 .addImm(ImmVal & 0xf);
5483 }
5484
5485 MI.eraseFromParent();
5486 return BB;
5487 }
5488 }
5489 }
5490
5491 // If only FP bits are touched, use the no-side-effects pseudo.
5492 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5493 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5494 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5495
5496 return BB;
5497 }
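// A minimal standalone sketch of the field arithmetic used in this case,
// assuming the (Offset, Width) pair has already been decoded from the hwreg
// immediate: the bits written by s_setreg_b32 are exactly a Width-wide mask
// shifted to Offset, and the dedicated S_ROUND_MODE / S_DENORM_MODE path is
// only usable when that mask lines up with a whole 4-bit MODE field.
static unsigned setregWriteMaskSketch(unsigned Offset, unsigned Width) {
  unsigned WidthMask = (Width >= 32) ? ~0u : ((1u << Width) - 1u);
  return WidthMask << Offset;
}
// For example, (Offset=0, Width=4) covers the round-mode field, (4, 4) the
// denorm-mode field, and (0, 8) both fields at once.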
5498 case AMDGPU::S_INVERSE_BALLOT_U32:
5499 case AMDGPU::S_INVERSE_BALLOT_U64:
5500 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5501 // necessary. After that they are equivalent to a COPY.
5502 MI.setDesc(TII->get(AMDGPU::COPY));
5503 return BB;
5504 case AMDGPU::ENDPGM_TRAP: {
5505 const DebugLoc &DL = MI.getDebugLoc();
5506 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5507 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5508 MI.addOperand(MachineOperand::CreateImm(0));
5509 return BB;
5510 }
5511
5512 // We need a block split to make the real endpgm a terminator. We also don't
5513 // want to break phis in successor blocks, so we can't just delete to the
5514 // end of the block.
5515
5516 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5518 MF->push_back(TrapBB);
5519 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5520 .addImm(0);
5521 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5522 .addMBB(TrapBB);
5523
5524 BB->addSuccessor(TrapBB);
5525 MI.eraseFromParent();
5526 return SplitBB;
5527 }
5528 case AMDGPU::SIMULATED_TRAP: {
5529 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5531 MachineBasicBlock *SplitBB =
5532 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5533 MI.eraseFromParent();
5534 return SplitBB;
5535 }
5536 default:
5537 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5538 if (!MI.mayStore())
5540 return BB;
5541 }
5543 }
5544}
5545
5547 // This currently forces unfolding various combinations of fsub into fma with
5548 // free fneg'd operands. As long as we have fast FMA (controlled by
5549 // isFMAFasterThanFMulAndFAdd), we should perform these.
5550
5551 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5552 // most of these combines appear to be cycle neutral but save on instruction
5553 // count / code size.
5554 return true;
5555}
5556
5558
5560 EVT VT) const {
5561 if (!VT.isVector()) {
5562 return MVT::i1;
5563 }
5564 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5565}
5566
5568 // TODO: Should i16 always be used if legal? For now it would force VALU
5569 // shifts.
5570 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5571}
5572
5574 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5575 ? Ty.changeElementSize(16)
5576 : Ty.changeElementSize(32);
5577}
5578
5579// Answering this is somewhat tricky and depends on the specific device, since
5580// devices have different rates for fma or for all f64 operations.
5581//
5582// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5583// regardless of which device (although the number of cycles differs between
5584// devices), so it is always profitable for f64.
5585//
5586// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5587// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5588// which we can always do even without fused FP ops since it returns the same
5589// result as the separate operations and since it is always full
5590// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5591// however does not support denormals, so we do report fma as faster if we have
5592// a fast fma device and require denormals.
5593//
5595 EVT VT) const {
5596 VT = VT.getScalarType();
5597
5598 switch (VT.getSimpleVT().SimpleTy) {
5599 case MVT::f32: {
5600 // If mad is not available this depends only on if f32 fma is full rate.
5601 if (!Subtarget->hasMadMacF32Insts())
5602 return Subtarget->hasFastFMAF32();
5603
5604 // Otherwise f32 mad is always full rate and returns the same result as
5605 // the separate operations so should be preferred over fma.
5606 // However, it does not support denormals.
5607 if (!denormalModeIsFlushAllF32(MF))
5608 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5609
5610 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5611 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5612 }
5613 case MVT::f64:
5614 return true;
5615 case MVT::f16:
5616 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5617 default:
5618 break;
5619 }
5620
5621 return false;
5622}
5623
5625 LLT Ty) const {
5626 switch (Ty.getScalarSizeInBits()) {
5627 case 16:
5628 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5629 case 32:
5630 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5631 case 64:
5632 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5633 default:
5634 break;
5635 }
5636
5637 return false;
5638}
5639
5641 if (!Ty.isScalar())
5642 return false;
5643
5644 if (Ty.getScalarSizeInBits() == 16)
5645 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5646 if (Ty.getScalarSizeInBits() == 32)
5647 return Subtarget->hasMadMacF32Insts() &&
5648 denormalModeIsFlushAllF32(*MI.getMF());
5649
5650 return false;
5651}
5652
5654 const SDNode *N) const {
5655 // TODO: Check future ftz flag
5656 // v_mad_f32/v_mac_f32 do not support denormals.
5657 EVT VT = N->getValueType(0);
5658 if (VT == MVT::f32)
5659 return Subtarget->hasMadMacF32Insts() &&
5661 if (VT == MVT::f16) {
5662 return Subtarget->hasMadF16() &&
5664 }
5665
5666 return false;
5667}
5668
5669//===----------------------------------------------------------------------===//
5670// Custom DAG Lowering Operations
5671//===----------------------------------------------------------------------===//
5672
5673// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5674// wider vector type is legal.
5676 SelectionDAG &DAG) const {
5677 unsigned Opc = Op.getOpcode();
5678 EVT VT = Op.getValueType();
5679 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5680 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5681 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5682 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5683
5684 SDValue Lo, Hi;
5685 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5686
5687 SDLoc SL(Op);
5688 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5689 Op->getFlags());
5690 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5691 Op->getFlags());
5692
5693 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5694}
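// A minimal standalone sketch of the split-and-concat pattern used by the
// splitUnaryVectorOp / splitBinaryVectorOp helpers, assuming <array> and
// <cstddef> with std::array standing in for the SDValue vectors: the
// operation is applied to the low and high halves separately and the two
// results are concatenated.
template <typename T, size_t N, typename UnaryOp>
static std::array<T, N> splitUnaryOpSketch(const std::array<T, N> &In,
                                           UnaryOp Op) {
  static_assert(N % 2 == 0, "expects an even number of elements");
  std::array<T, N> Out;
  for (size_t I = 0; I != N / 2; ++I) {
    Out[I] = Op(In[I]);                 // OpLo on the low half
    Out[I + N / 2] = Op(In[I + N / 2]); // OpHi on the high half
  }
  return Out;
}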
5695
5696// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5697// wider vector type is legal.
5699 SelectionDAG &DAG) const {
5700 unsigned Opc = Op.getOpcode();
5701 EVT VT = Op.getValueType();
5702 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5703 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5704 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5705 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5706
5707 SDValue Lo0, Hi0;
5708 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5709 SDValue Lo1, Hi1;
5710 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5711
5712 SDLoc SL(Op);
5713
5714 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5715 Op->getFlags());
5716 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5717 Op->getFlags());
5718
5719 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5720}
5721
5723 SelectionDAG &DAG) const {
5724 unsigned Opc = Op.getOpcode();
5725 EVT VT = Op.getValueType();
5726 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5727 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5728 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5729 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5730 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5731 VT == MVT::v32bf16);
5732
5733 SDValue Lo0, Hi0;
5734 SDValue Op0 = Op.getOperand(0);
5735 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5736 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5737 : std::pair(Op0, Op0);
5738 SDValue Lo1, Hi1;
5739 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5740 SDValue Lo2, Hi2;
5741 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5742
5743 SDLoc SL(Op);
5744 auto ResVT = DAG.GetSplitDestVTs(VT);
5745
5746 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5747 Op->getFlags());
5748 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5749 Op->getFlags());
5750
5751 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5752}
5753
5754
5756 switch (Op.getOpcode()) {
5757 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5758 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5759 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5760 case ISD::LOAD: {
5761 SDValue Result = LowerLOAD(Op, DAG);
5762 assert((!Result.getNode() ||
5763 Result.getNode()->getNumValues() == 2) &&
5764 "Load should return a value and a chain");
5765 return Result;
5766 }
5767 case ISD::FSQRT: {
5768 EVT VT = Op.getValueType();
5769 if (VT == MVT::f32)
5770 return lowerFSQRTF32(Op, DAG);
5771 if (VT == MVT::f64)
5772 return lowerFSQRTF64(Op, DAG);
5773 return SDValue();
5774 }
5775 case ISD::FSIN:
5776 case ISD::FCOS:
5777 return LowerTrig(Op, DAG);
5778 case ISD::SELECT: return LowerSELECT(Op, DAG);
5779 case ISD::FDIV: return LowerFDIV(Op, DAG);
5780 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5781 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5782 case ISD::STORE: return LowerSTORE(Op, DAG);
5783 case ISD::GlobalAddress: {
5786 return LowerGlobalAddress(MFI, Op, DAG);
5787 }
5788 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5789 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5790 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5791 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5793 return lowerINSERT_SUBVECTOR(Op, DAG);
5795 return lowerINSERT_VECTOR_ELT(Op, DAG);
5797 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5799 return lowerVECTOR_SHUFFLE(Op, DAG);
5801 return lowerSCALAR_TO_VECTOR(Op, DAG);
5802 case ISD::BUILD_VECTOR:
5803 return lowerBUILD_VECTOR(Op, DAG);
5804 case ISD::FP_ROUND:
5806 return lowerFP_ROUND(Op, DAG);
5807 case ISD::FPTRUNC_ROUND:
5808 return lowerFPTRUNC_ROUND(Op, DAG);
5809 case ISD::TRAP:
5810 return lowerTRAP(Op, DAG);
5811 case ISD::DEBUGTRAP:
5812 return lowerDEBUGTRAP(Op, DAG);
5813 case ISD::ABS:
5814 case ISD::FABS:
5815 case ISD::FNEG:
5816 case ISD::FCANONICALIZE:
5817 case ISD::BSWAP:
5818 return splitUnaryVectorOp(Op, DAG);
5819 case ISD::FMINNUM:
5820 case ISD::FMAXNUM:
5821 return lowerFMINNUM_FMAXNUM(Op, DAG);
5822 case ISD::FLDEXP:
5823 case ISD::STRICT_FLDEXP:
5824 return lowerFLDEXP(Op, DAG);
5825 case ISD::FMA:
5826 return splitTernaryVectorOp(Op, DAG);
5827 case ISD::FP_TO_SINT:
5828 case ISD::FP_TO_UINT:
5829 return LowerFP_TO_INT(Op, DAG);
5830 case ISD::SHL:
5831 case ISD::SRA:
5832 case ISD::SRL:
5833 case ISD::ADD:
5834 case ISD::SUB:
5835 case ISD::SMIN:
5836 case ISD::SMAX:
5837 case ISD::UMIN:
5838 case ISD::UMAX:
5839 case ISD::FADD:
5840 case ISD::FMUL:
5841 case ISD::FMINNUM_IEEE:
5842 case ISD::FMAXNUM_IEEE:
5843 case ISD::FMINIMUM:
5844 case ISD::FMAXIMUM:
5845 case ISD::UADDSAT:
5846 case ISD::USUBSAT:
5847 case ISD::SADDSAT:
5848 case ISD::SSUBSAT:
5849 return splitBinaryVectorOp(Op, DAG);
5850 case ISD::MUL:
5851 return lowerMUL(Op, DAG);
5852 case ISD::SMULO:
5853 case ISD::UMULO:
5854 return lowerXMULO(Op, DAG);
5855 case ISD::SMUL_LOHI:
5856 case ISD::UMUL_LOHI:
5857 return lowerXMUL_LOHI(Op, DAG);
5859 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5860 case ISD::STACKSAVE:
5861 return LowerSTACKSAVE(Op, DAG);
5862 case ISD::GET_ROUNDING:
5863 return lowerGET_ROUNDING(Op, DAG);
5864 case ISD::SET_ROUNDING:
5865 return lowerSET_ROUNDING(Op, DAG);
5866 case ISD::PREFETCH:
5867 return lowerPREFETCH(Op, DAG);
5868 case ISD::FP_EXTEND:
5870 return lowerFP_EXTEND(Op, DAG);
5871 case ISD::GET_FPENV:
5872 return lowerGET_FPENV(Op, DAG);
5873 case ISD::SET_FPENV:
5874 return lowerSET_FPENV(Op, DAG);
5875 }
5876 return SDValue();
5877}
5878
5879// Used for D16: Casts the result of an instruction into the right vector,
5880// packs values if loads return unpacked values.
5882 const SDLoc &DL,
5883 SelectionDAG &DAG, bool Unpacked) {
5884 if (!LoadVT.isVector())
5885 return Result;
5886
5887 // Cast back to the original packed type or to a larger type that is a
5888 // multiple of 32 bits for D16. Widening the return type is required for
5889 // legalization.
5890 EVT FittingLoadVT = LoadVT;
5891 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5892 FittingLoadVT =
5894 LoadVT.getVectorNumElements() + 1);
5895 }
5896
5897 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5898 // Truncate to v2i16/v4i16.
5899 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5900
5901 // Work around the legalizer not scalarizing the truncate after vector op
5902 // legalization and not creating an intermediate vector trunc.
5904 DAG.ExtractVectorElements(Result, Elts);
5905 for (SDValue &Elt : Elts)
5906 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5907
5908 // Pad illegal v1i16/v3f16 to v4i16
5909 if ((LoadVT.getVectorNumElements() % 2) == 1)
5910 Elts.push_back(DAG.getUNDEF(MVT::i16));
5911
5912 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5913
5914 // Bitcast to original type (v2f16/v4f16).
5915 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5916 }
5917
5918 // Cast back to the original packed type.
5919 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5920}
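// A minimal standalone sketch of the "unpacked" D16 repacking done above,
// assuming a hypothetical array of 32-bit lanes that each carry one 16-bit
// element in their low half: every lane is truncated to 16 bits and
// consecutive pairs are packed back into 32-bit words, mirroring the
// TRUNCATE + build-vector + BITCAST sequence (with an undef pad, here zero,
// for an odd element count).
static void repackD16Sketch(const uint32_t *UnpackedLanes, unsigned NumElts,
                            uint32_t *PackedWords) {
  for (unsigned I = 0; I < NumElts; I += 2) {
    uint16_t Lo = uint16_t(UnpackedLanes[I]);
    uint16_t Hi = (I + 1 < NumElts) ? uint16_t(UnpackedLanes[I + 1]) : 0;
    PackedWords[I / 2] = uint32_t(Lo) | (uint32_t(Hi) << 16);
  }
}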
5921
5922SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5923 MemSDNode *M,
5924 SelectionDAG &DAG,
5926 bool IsIntrinsic) const {
5927 SDLoc DL(M);
5928
5929 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5930 EVT LoadVT = M->getValueType(0);
5931
5932 EVT EquivLoadVT = LoadVT;
5933 if (LoadVT.isVector()) {
5934 if (Unpacked) {
5935 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5936 LoadVT.getVectorNumElements());
5937 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5938 // Widen v3f16 to legal type
5939 EquivLoadVT =
5941 LoadVT.getVectorNumElements() + 1);
5942 }
5943 }
5944
5945 // Change from v4f16/v2f16 to EquivLoadVT.
5946 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5947
5949 = DAG.getMemIntrinsicNode(
5950 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5951 VTList, Ops, M->getMemoryVT(),
5952 M->getMemOperand());
5953
5954 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5955
5956 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5957}
5958
5959SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5960 SelectionDAG &DAG,
5961 ArrayRef<SDValue> Ops) const {
5962 SDLoc DL(M);
5963 EVT LoadVT = M->getValueType(0);
5964 EVT EltType = LoadVT.getScalarType();
5965 EVT IntVT = LoadVT.changeTypeToInteger();
5966
5967 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5968
5969 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5970 bool IsTFE = M->getNumValues() == 3;
5971
5972 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5976
5977 if (IsD16) {
5978 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5979 }
5980
5981 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5982 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5983 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
5984 IsTFE);
5985
5986 if (isTypeLegal(LoadVT)) {
5987 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5988 M->getMemOperand(), DAG);
5989 }
5990
5991 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5992 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5993 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5994 M->getMemOperand(), DAG);
5995 return DAG.getMergeValues(
5996 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5997 DL);
5998}
5999
6001 SDNode *N, SelectionDAG &DAG) {
6002 EVT VT = N->getValueType(0);
6003 unsigned CondCode = N->getConstantOperandVal(3);
6004 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6005 return DAG.getUNDEF(VT);
6006
6007 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6008
6009 SDValue LHS = N->getOperand(1);
6010 SDValue RHS = N->getOperand(2);
6011
6012 SDLoc DL(N);
6013
6014 EVT CmpVT = LHS.getValueType();
6015 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6016 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6018 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6019 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6020 }
6021
6022 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6023
6024 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6025 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6026
6027 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6028 DAG.getCondCode(CCOpcode));
6029 if (VT.bitsEq(CCVT))
6030 return SetCC;
6031 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6032}
6033
6035 SDNode *N, SelectionDAG &DAG) {
6036 EVT VT = N->getValueType(0);
6037
6038 unsigned CondCode = N->getConstantOperandVal(3);
6039 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6040 return DAG.getUNDEF(VT);
6041
6042 SDValue Src0 = N->getOperand(1);
6043 SDValue Src1 = N->getOperand(2);
6044 EVT CmpVT = Src0.getValueType();
6045 SDLoc SL(N);
6046
6047 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6048 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6049 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6050 }
6051
6052 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6053 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6054 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6055 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6056 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6057 Src1, DAG.getCondCode(CCOpcode));
6058 if (VT.bitsEq(CCVT))
6059 return SetCC;
6060 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6061}
6062
6064 SelectionDAG &DAG) {
6065 EVT VT = N->getValueType(0);
6066 SDValue Src = N->getOperand(1);
6067 SDLoc SL(N);
6068
6069 if (Src.getOpcode() == ISD::SETCC) {
6070 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6071 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6072 Src.getOperand(1), Src.getOperand(2));
6073 }
6074 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6075 // (ballot 0) -> 0
6076 if (Arg->isZero())
6077 return DAG.getConstant(0, SL, VT);
6078
6079 // (ballot 1) -> EXEC/EXEC_LO
6080 if (Arg->isOne()) {
6081 Register Exec;
6082 if (VT.getScalarSizeInBits() == 32)
6083 Exec = AMDGPU::EXEC_LO;
6084 else if (VT.getScalarSizeInBits() == 64)
6085 Exec = AMDGPU::EXEC;
6086 else
6087 return SDValue();
6088
6089 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6090 }
6091 }
6092
6093 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6094 // ISD::SETNE)
6095 return DAG.getNode(
6096 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6097 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6098}
6099
6101 SelectionDAG &DAG) {
6102 EVT VT = N->getValueType(0);
6103 unsigned ValSize = VT.getSizeInBits();
6104 unsigned IID = N->getConstantOperandVal(0);
6105 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6106 IID == Intrinsic::amdgcn_permlanex16;
6107 SDLoc SL(N);
6108 MVT IntVT = MVT::getIntegerVT(ValSize);
6109
6110 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6111 SDValue Src2, MVT ValT) -> SDValue {
6113 switch (IID) {
6114 case Intrinsic::amdgcn_permlane16:
6115 case Intrinsic::amdgcn_permlanex16:
6116 Operands.push_back(N->getOperand(6));
6117 Operands.push_back(N->getOperand(5));
6118 Operands.push_back(N->getOperand(4));
6119 [[fallthrough]];
6120 case Intrinsic::amdgcn_writelane:
6121 Operands.push_back(Src2);
6122 [[fallthrough]];
6123 case Intrinsic::amdgcn_readlane:
6124 Operands.push_back(Src1);
6125 [[fallthrough]];
6126 case Intrinsic::amdgcn_readfirstlane:
6127 case Intrinsic::amdgcn_permlane64:
6128 Operands.push_back(Src0);
6129 break;
6130 default:
6131 llvm_unreachable("unhandled lane op");
6132 }
6133
6134 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6135 std::reverse(Operands.begin(), Operands.end());
6136
6137 if (SDNode *GL = N->getGluedNode()) {
6138 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6139 GL = GL->getOperand(0).getNode();
6140 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6141 SDValue(GL, 0)));
6142 }
6143
6144 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6145 };
6146
6147 SDValue Src0 = N->getOperand(1);
6148 SDValue Src1, Src2;
6149 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6150 IsPermLane16) {
6151 Src1 = N->getOperand(2);
6152 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6153 Src2 = N->getOperand(3);
6154 }
6155
6156 if (ValSize == 32) {
6157 // Already legal
6158 return SDValue();
6159 }
6160
6161 if (ValSize < 32) {
6162 bool IsFloat = VT.isFloatingPoint();
6163 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6164 SL, MVT::i32);
6165
6166 if (IsPermLane16) {
6167 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6168 SL, MVT::i32);
6169 }
6170
6171 if (IID == Intrinsic::amdgcn_writelane) {
6172 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6173 SL, MVT::i32);
6174 }
6175
6176 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6177 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6178 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6179 }
6180
6181 if (ValSize % 32 != 0)
6182 return SDValue();
6183
6184 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6185 EVT VT = N->getValueType(0);
6186 unsigned NE = VT.getVectorNumElements();
6187 EVT EltVT = VT.getVectorElementType();
6189 unsigned NumOperands = N->getNumOperands();
6190 SmallVector<SDValue, 4> Operands(NumOperands);
6191 SDNode *GL = N->getGluedNode();
6192
6193 // only handle convergencectrl_glue
6195
6196 for (unsigned i = 0; i != NE; ++i) {
6197 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6198 ++j) {
6199 SDValue Operand = N->getOperand(j);
6200 EVT OperandVT = Operand.getValueType();
6201 if (OperandVT.isVector()) {
6202 // A vector operand; extract a single element.
6203 EVT OperandEltVT = OperandVT.getVectorElementType();
6204 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6205 Operand, DAG.getVectorIdxConstant(i, SL));
6206 } else {
6207 // A scalar operand; just use it as is.
6208 Operands[j] = Operand;
6209 }
6210 }
6211
6212 if (GL)
6213 Operands[NumOperands - 1] =
6214 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6215 SDValue(GL->getOperand(0).getNode(), 0));
6216
6217 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6218 }
6219
6220 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6221 return DAG.getBuildVector(VecVT, SL, Scalars);
6222 };
6223
6224 if (VT.isVector()) {
6225 switch (MVT::SimpleValueType EltTy =
6227 case MVT::i32:
6228 case MVT::f32: {
6229 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6230 return unrollLaneOp(LaneOp.getNode());
6231 }
6232 case MVT::i16:
6233 case MVT::f16:
6234 case MVT::bf16: {
6235 MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6237 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6238 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6239 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6240 DAG.getConstant(EltIdx, SL, MVT::i32));
6241
6242 if (IsPermLane16)
6243 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6244 DAG.getConstant(EltIdx, SL, MVT::i32));
6245
6246 if (IID == Intrinsic::amdgcn_writelane)
6247 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6248 DAG.getConstant(EltIdx, SL, MVT::i32));
6249
6250 Pieces.push_back(
6251 IsPermLane16
6252 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6253 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6254 EltIdx += 2;
6255 }
6256 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6257 }
6258 default:
6259 // Handle all other cases by bitcasting to i32 vectors
6260 break;
6261 }
6262 }
6263
6264 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6265 Src0 = DAG.getBitcast(VecVT, Src0);
6266
6267 if (IsPermLane16)
6268 Src1 = DAG.getBitcast(VecVT, Src1);
6269
6270 if (IID == Intrinsic::amdgcn_writelane)
6271 Src2 = DAG.getBitcast(VecVT, Src2);
6272
6273 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6274 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6275 return DAG.getBitcast(VT, UnrolledLaneOp);
6276}
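// A minimal standalone sketch of the piecewise strategy used for the lane
// intrinsics above, assuming a hypothetical 32-bit-only primitive Op32:
// narrower values are widened to 32 bits first, and wider values are cut
// into 32-bit words that are processed individually and reassembled, which
// is what the bitcast-to-i32-vector plus unrollLaneOp path does.
static uint64_t applyLaneOp64Sketch(uint64_t Val, uint32_t (*Op32)(uint32_t)) {
  uint32_t Lo = Op32(uint32_t(Val));
  uint32_t Hi = Op32(uint32_t(Val >> 32));
  return (uint64_t(Hi) << 32) | Lo;
}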
6277
6280 SelectionDAG &DAG) const {
6281 switch (N->getOpcode()) {
6283 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6284 Results.push_back(Res);
6285 return;
6286 }
6288 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6289 Results.push_back(Res);
6290 return;
6291 }
6293 unsigned IID = N->getConstantOperandVal(0);
6294 switch (IID) {
6295 case Intrinsic::amdgcn_make_buffer_rsrc:
6296 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6297 return;
6298 case Intrinsic::amdgcn_cvt_pkrtz: {
6299 SDValue Src0 = N->getOperand(1);
6300 SDValue Src1 = N->getOperand(2);
6301 SDLoc SL(N);
6302 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6303 Src0, Src1);
6304 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6305 return;
6306 }
6307 case Intrinsic::amdgcn_cvt_pknorm_i16:
6308 case Intrinsic::amdgcn_cvt_pknorm_u16:
6309 case Intrinsic::amdgcn_cvt_pk_i16:
6310 case Intrinsic::amdgcn_cvt_pk_u16: {
6311 SDValue Src0 = N->getOperand(1);
6312 SDValue Src1 = N->getOperand(2);
6313 SDLoc SL(N);
6314 unsigned Opcode;
6315
6316 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6318 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6320 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6322 else
6324
6325 EVT VT = N->getValueType(0);
6326 if (isTypeLegal(VT))
6327 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6328 else {
6329 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6330 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6331 }
6332 return;
6333 }
6334 case Intrinsic::amdgcn_s_buffer_load: {
6335 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6336 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6337 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6338 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6339 // s_buffer_load_i8.
6340 if (!Subtarget->hasScalarSubwordLoads())
6341 return;
6342 SDValue Op = SDValue(N, 0);
6343 SDValue Rsrc = Op.getOperand(1);
6344 SDValue Offset = Op.getOperand(2);
6345 SDValue CachePolicy = Op.getOperand(3);
6346 EVT VT = Op.getValueType();
6347 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6348 SDLoc DL(Op);
6350 const DataLayout &DataLayout = DAG.getDataLayout();
6351 Align Alignment =
6357 VT.getStoreSize(), Alignment);
6358 SDValue LoadVal;
6359 if (!Offset->isDivergent()) {
6360 SDValue Ops[] = {Rsrc, // source register
6361 Offset, CachePolicy};
6362 SDValue BufferLoad =
6364 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6365 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6366 } else {
6367 SDValue Ops[] = {
6368 DAG.getEntryNode(), // Chain
6369 Rsrc, // rsrc
6370 DAG.getConstant(0, DL, MVT::i32), // vindex
6371 {}, // voffset
6372 {}, // soffset
6373 {}, // offset
6374 CachePolicy, // cachepolicy
6375 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6376 };
6377 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6378 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6379 }
6380 Results.push_back(LoadVal);
6381 return;
6382 }
6383 }
6384 break;
6385 }
6387 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6388 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6389 // FIXME: Hacky
6390 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6391 Results.push_back(Res.getOperand(I));
6392 }
6393 } else {
6394 Results.push_back(Res);
6395 Results.push_back(Res.getValue(1));
6396 }
6397 return;
6398 }
6399
6400 break;
6401 }
6402 case ISD::SELECT: {
6403 SDLoc SL(N);
6404 EVT VT = N->getValueType(0);
6405 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6406 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6407 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6408
6409 EVT SelectVT = NewVT;
6410 if (NewVT.bitsLT(MVT::i32)) {
6411 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6412 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6413 SelectVT = MVT::i32;
6414 }
6415
6416 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6417 N->getOperand(0), LHS, RHS);
6418
6419 if (NewVT != SelectVT)
6420 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6421 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6422 return;
6423 }
6424 case ISD::FNEG: {
6425 if (N->getValueType(0) != MVT::v2f16)
6426 break;
6427
6428 SDLoc SL(N);
6429 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6430
6431 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6432 BC,
6433 DAG.getConstant(0x80008000, SL, MVT::i32));
6434 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6435 return;
6436 }
6437 case ISD::FABS: {
6438 if (N->getValueType(0) != MVT::v2f16)
6439 break;
6440
6441 SDLoc SL(N);
6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6443
6444 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6445 BC,
6446 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6447 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6448 return;
6449 }
6450 case ISD::FSQRT: {
6451 if (N->getValueType(0) != MVT::f16)
6452 break;
6453 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6454 break;
6455 }
6456 default:
6458 break;
6459 }
6460}
6461
6462/// Helper function for LowerBRCOND
6463static SDNode *findUser(SDValue Value, unsigned Opcode) {
6464
6465 SDNode *Parent = Value.getNode();
6466 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6467 I != E; ++I) {
6468
6469 if (I.getUse().get() != Value)
6470 continue;
6471
6472 if (I->getOpcode() == Opcode)
6473 return *I;
6474 }
6475 return nullptr;
6476}
6477
6478unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6479 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6480 switch (Intr->getConstantOperandVal(1)) {
6481 case Intrinsic::amdgcn_if:
6482 return AMDGPUISD::IF;
6483 case Intrinsic::amdgcn_else:
6484 return AMDGPUISD::ELSE;
6485 case Intrinsic::amdgcn_loop:
6486 return AMDGPUISD::LOOP;
6487 case Intrinsic::amdgcn_end_cf:
6488 llvm_unreachable("should not occur");
6489 default:
6490 return 0;
6491 }
6492 }
6493
6494 // break, if_break, else_break are all only used as inputs to loop, not
6495 // directly as branch conditions.
6496 return 0;
6497}
6498
6500 const Triple &TT = getTargetMachine().getTargetTriple();
6504}
6505
6507 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6508 return false;
6509
6510 // FIXME: Either avoid relying on address space here or change the default
6511 // address space for functions to avoid the explicit check.
6512 return (GV->getValueType()->isFunctionTy() ||
6515}
6516
6518 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6519}
6520
6522 if (!GV->hasExternalLinkage())
6523 return true;
6524
6525 const auto OS = getTargetMachine().getTargetTriple().getOS();
6526 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6527}
6528
6529 /// This transforms the control flow intrinsics to get the branch destination as
6530 /// the last parameter. It also switches the branch target with BR if the need arises.
6531SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6532 SelectionDAG &DAG) const {
6533 SDLoc DL(BRCOND);
6534
6535 SDNode *Intr = BRCOND.getOperand(1).getNode();
6536 SDValue Target = BRCOND.getOperand(2);
6537 SDNode *BR = nullptr;
6538 SDNode *SetCC = nullptr;
6539
6540 if (Intr->getOpcode() == ISD::SETCC) {
6541 // As long as we negate the condition everything is fine
6542 SetCC = Intr;
6543 Intr = SetCC->getOperand(0).getNode();
6544
6545 } else {
6546 // Get the target from BR if we don't negate the condition
6547 BR = findUser(BRCOND, ISD::BR);
6548 assert(BR && "brcond missing unconditional branch user");
6549 Target = BR->getOperand(1);
6550 }
6551
6552 unsigned CFNode = isCFIntrinsic(Intr);
6553 if (CFNode == 0) {
6554 // This is a uniform branch so we don't need to legalize.
6555 return BRCOND;
6556 }
6557
6558 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6559 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6560
6561 assert(!SetCC ||
6562 (SetCC->getConstantOperandVal(1) == 1 &&
6563 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6564 ISD::SETNE));
6565
6566 // operands of the new intrinsic call
6568 if (HaveChain)
6569 Ops.push_back(BRCOND.getOperand(0));
6570
6571 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6572 Ops.push_back(Target);
6573
6574 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6575
6576 // build the new intrinsic call
6577 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6578
6579 if (!HaveChain) {
6580 SDValue Ops[] = {
6581 SDValue(Result, 0),
6582 BRCOND.getOperand(0)
6583 };
6584
6585 Result = DAG.getMergeValues(Ops, DL).getNode();
6586 }
6587
6588 if (BR) {
6589 // Give the branch instruction our target
6590 SDValue Ops[] = {
6591 BR->getOperand(0),
6592 BRCOND.getOperand(2)
6593 };
6594 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6595 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6596 }
6597
6598 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6599
6600 // Copy the intrinsic results to registers
6601 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6603 if (!CopyToReg)
6604 continue;
6605
6606 Chain = DAG.getCopyToReg(
6607 Chain, DL,
6608 CopyToReg->getOperand(1),
6609 SDValue(Result, i - 1),
6610 SDValue());
6611
6612 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6613 }
6614
6615 // Remove the old intrinsic from the chain
6617 SDValue(Intr, Intr->getNumValues() - 1),
6618 Intr->getOperand(0));
6619
6620 return Chain;
6621}
6622
6623SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6624 SelectionDAG &DAG) const {
6625 MVT VT = Op.getSimpleValueType();
6626 SDLoc DL(Op);
6627 // Checking the depth
6628 if (Op.getConstantOperandVal(0) != 0)
6629 return DAG.getConstant(0, DL, VT);
6630
6633 // Check for kernel and shader functions
6634 if (Info->isEntryFunction())
6635 return DAG.getConstant(0, DL, VT);
6636
6637 MachineFrameInfo &MFI = MF.getFrameInfo();
6638 // There is a call to @llvm.returnaddress in this function
6639 MFI.setReturnAddressIsTaken(true);
6640
6642 // Get the return address reg and mark it as an implicit live-in
6643 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6644
6645 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6646}
6647
6648SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6649 SDValue Op,
6650 const SDLoc &DL,
6651 EVT VT) const {
6652 return Op.getValueType().bitsLE(VT) ?
6653 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6654 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6655 DAG.getTargetConstant(0, DL, MVT::i32));
6656}
6657
6658SDValue SITargetLowering::lowerFPTRUNC_ROUND(SDValue Op,
6659 SelectionDAG &DAG) const {
6660 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
6661 return SDValue();
6662
6663 // Only support towardzero, tonearest, upward and downward.
6664 int RoundMode = Op.getConstantOperandVal(1);
6665 if (RoundMode != (int)RoundingMode::TowardZero &&
6666 RoundMode != (int)RoundingMode::NearestTiesToEven &&
6667 RoundMode != (int)RoundingMode::TowardPositive &&
6668 RoundMode != (int)RoundingMode::TowardNegative)
6669 return SDValue();
6670
6671 // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
6672 // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6673 // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
6674 // "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
6675 unsigned HW_Mode = (RoundMode + 3) % 4;
6676 SDLoc DL(Op);
6677 SDValue RoundFlag = DAG.getTargetConstant(HW_Mode, DL, MVT::i32);
6678 return DAG.getNode(AMDGPUISD::FPTRUNC_ROUND, DL, Op.getNode()->getVTList(),
6679 Op->getOperand(0), RoundFlag);
6680}
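// A minimal standalone sketch of the remapping computed above: the
// llvm.fptrunc.round argument uses the RoundingMode numbering (TowardZero=0,
// NearestTiesToEven=1, TowardPositive=2, TowardNegative=3), while the
// hardware field expects NEAREST=0, INF=1, NEGINF=2, ZERO=3, and
// (Mode + 3) % 4 rotates one ordering into the other.
static unsigned toHWRoundFieldSketch(unsigned RoundingModeVal) {
  return (RoundingModeVal + 3) % 4;
}
// toHWRoundFieldSketch(0) == 3 (toward zero), (1) == 0 (nearest even),
// (2) == 1 (toward +inf), (3) == 2 (toward -inf).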
6681
6682SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6683 assert(Op.getValueType() == MVT::f16 &&
6684 "Do not know how to custom lower FP_ROUND for non-f16 type");
6685
6686 SDValue Src = Op.getOperand(0);
6687 EVT SrcVT = Src.getValueType();
6688 if (SrcVT != MVT::f64)
6689 return Op;
6690
6691 // TODO: Handle strictfp
6692 if (Op.getOpcode() != ISD::FP_ROUND)
6693 return Op;
6694
6695 SDLoc DL(Op);
6696
6697 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6698 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6699 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6700}
6701
6702SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6703 SelectionDAG &DAG) const {
6704 EVT VT = Op.getValueType();
6705 const MachineFunction &MF = DAG.getMachineFunction();
6707 bool IsIEEEMode = Info->getMode().IEEE;
6708
6709 // FIXME: Assert during selection that this is only selected for
6710 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6711 // mode functions, but this happens to be OK since it's only done in cases
6712 // where it is known that there is no sNaN.
6713 if (IsIEEEMode)
6714 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6715
6716 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6717 VT == MVT::v16bf16)
6718 return splitBinaryVectorOp(Op, DAG);
6719 return Op;
6720}
6721
6722SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6723 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6724 EVT VT = Op.getValueType();
6725 assert(VT == MVT::f16);
6726
6727 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6728 EVT ExpVT = Exp.getValueType();
6729 if (ExpVT == MVT::i16)
6730 return Op;
6731
6732 SDLoc DL(Op);
6733
6734 // Correct the exponent type for f16 to i16.
6735 // Clamp the range of the exponent to the instruction's range.
6736
6737 // TODO: This should be a generic narrowing legalization, and can easily be
6738 // done for GlobalISel as well.
6739
6740 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6741 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6742
6743 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6744 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6745
6746 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6747
6748 if (IsStrict) {
6749 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6750 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6751 }
6752
6753 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6754}
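// A minimal standalone sketch of the exponent clamping above, assuming
// <algorithm> and <cstdint>: the i32 exponent is clamped to the signed
// 16-bit range before truncation, matching the SMAX / SMIN / TRUNCATE
// sequence that feeds the i16 exponent operand.
static int16_t clampExpToI16Sketch(int32_t Exp) {
  int32_t Clamped =
      std::min(std::max(Exp, int32_t(INT16_MIN)), int32_t(INT16_MAX));
  return int16_t(Clamped);
}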
6755
6756// Custom lowering for vector multiplications and s_mul_u64.
6757SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6758 EVT VT = Op.getValueType();
6759
6760 // Split vector operands.
6761 if (VT.isVector())
6762 return splitBinaryVectorOp(Op, DAG);
6763
6764 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6765
6766 // There are four ways to lower s_mul_u64:
6767 //
6768 // 1. If all the operands are uniform, then we lower it as it is.
6769 //
6770 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6771 // multiplications because there is not a vector equivalent of s_mul_u64.
6772 //
6773 // 3. If the cost model decides that it is more efficient to use vector
6774 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6775 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6776 //
6777 // 4. If the cost model decides to use vector registers and both of the
6778 // operands are zero-extended/sign-extended from 32-bits, then we split the
6779 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6780 // possible to check if the operands are zero-extended or sign-extended in
6781 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6782 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6783 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6784 // If the cost model decides that we have to use vector registers, then
6785 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6786 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6787 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6788 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6789 // SIInstrInfo.cpp .
6790
6791 if (Op->isDivergent())
6792 return SDValue();
6793
6794 SDValue Op0 = Op.getOperand(0);
6795 SDValue Op1 = Op.getOperand(1);
6796 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6797 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6798 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6799 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6800 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6801 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6802 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6803 SDLoc SL(Op);
6804 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6805 return SDValue(
6806 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6807 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6808 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6809 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6810 return SDValue(
6811 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6812 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6813 return Op;
6814}
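// A minimal standalone sketch of the known-bits reasoning above, assuming
// <cstdint>: when both operands are known to have at least 32 leading zero
// bits they fit in 32 unsigned bits, so the 64-bit product of their low
// halves is exact; the signed variant needs at least 33 sign bits on each
// operand for the analogous 32x32 signed multiply to be exact.
static bool fitsInUnsigned32Sketch(uint64_t V) { return (V >> 32) == 0; }

static uint64_t mulU64ViaU32Sketch(uint64_t A, uint64_t B) {
  // Precondition: fitsInUnsigned32Sketch(A) && fitsInUnsigned32Sketch(B).
  return uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));
}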
6815
6816SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6817 EVT VT = Op.getValueType();
6818 SDLoc SL(Op);
6819 SDValue LHS = Op.getOperand(0);
6820 SDValue RHS = Op.getOperand(1);
6821 bool isSigned = Op.getOpcode() == ISD::SMULO;
6822
6823 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6824 const APInt &C = RHSC->getAPIntValue();
6825 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6826 if (C.isPowerOf2()) {
6827 // smulo(x, signed_min) is the same as umulo(x, signed_min).
6828 bool UseArithShift = isSigned && !C.isMinSignedValue();
6829 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6830 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6831 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6832 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6833 SL, VT, Result, ShiftAmt),
6834 LHS, ISD::SETNE);
6835 return DAG.getMergeValues({ Result, Overflow }, SL);
6836 }
6837 }
6838
6839 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6841 SL, VT, LHS, RHS);
6842
6843 SDValue Sign = isSigned
6844 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6845 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6846 : DAG.getConstant(0, SL, VT);
6847 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6848
6849 return DAG.getMergeValues({ Result, Overflow }, SL);
6850}
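// A minimal standalone sketch of the power-of-two fast path above, assuming
// <cstdint> and S < 64: multiplying by 1 << S is just a shift, and the
// multiply overflowed exactly when shifting the product back down does not
// reproduce the original operand (logical shift here for the unsigned case;
// the signed case uses an arithmetic shift instead).
static bool umulByPow2OverflowsSketch(uint64_t X, unsigned S,
                                      uint64_t &Product) {
  Product = X << S;
  return (Product >> S) != X;
}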
6851
6852SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6853 if (Op->isDivergent()) {
6854 // Select to V_MAD_[IU]64_[IU]32.
6855 return Op;
6856 }
6857 if (Subtarget->hasSMulHi()) {
6858 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6859 return SDValue();
6860 }
6861 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6862 // calculate the high part, so we might as well do the whole thing with
6863 // V_MAD_[IU]64_[IU]32.
6864 return Op;
6865}
6866
6867SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6868 if (!Subtarget->isTrapHandlerEnabled() ||
6870 return lowerTrapEndpgm(Op, DAG);
6871
6872 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6873 lowerTrapHsaQueuePtr(Op, DAG);
6874}
6875
6876SDValue SITargetLowering::lowerTrapEndpgm(
6877 SDValue Op, SelectionDAG &DAG) const {
6878 SDLoc SL(Op);
6879 SDValue Chain = Op.getOperand(0);
6880 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6881}
6882
6883SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6884 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6887 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6889 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6892}
6893
6894SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6895 SDValue Op, SelectionDAG &DAG) const {
6896 SDLoc SL(Op);
6897 SDValue Chain = Op.getOperand(0);
6898
6899 SDValue QueuePtr;
6900 // For code object version 5, QueuePtr is passed through implicit kernarg.
6901 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6903 QueuePtr =
6904 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6905 } else {
6908 Register UserSGPR = Info->getQueuePtrUserSGPR();
6909
6910 if (UserSGPR == AMDGPU::NoRegister) {
6911 // We probably are in a function incorrectly marked with
6912 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6913 // trap, so just use a null pointer.
6914 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6915 } else {
6916 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6917 MVT::i64);
6918 }
6919 }
6920
6921 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6922 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6923 QueuePtr, SDValue());
6924
6926 SDValue Ops[] = {
6927 ToReg,
6928 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6929 SGPR01,
6930 ToReg.getValue(1)
6931 };
6932 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6933}
6934
6935SDValue SITargetLowering::lowerTrapHsa(
6936 SDValue Op, SelectionDAG &DAG) const {
6937 SDLoc SL(Op);
6938 SDValue Chain = Op.getOperand(0);
6939
6940 // We need to simulate the 's_trap 2' instruction on targets that run in
6941 // PRIV=1 (where it is treated as a nop).
6942 if (Subtarget->hasPrivEnabledTrap2NopBug())
6943 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6944
6946 SDValue Ops[] = {
6947 Chain,
6948 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6949 };
6950 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6951}
6952
6953SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6954 SDLoc SL(Op);
6955 SDValue Chain = Op.getOperand(0);
6957
6958 if (!Subtarget->isTrapHandlerEnabled() ||
6961 "debugtrap handler not supported",
6962 Op.getDebugLoc(),
6963 DS_Warning);
6964 LLVMContext &Ctx = MF.getFunction().getContext();
6965 Ctx.diagnose(NoTrap);
6966 return Chain;
6967 }
6968
6970 SDValue Ops[] = {
6971 Chain,
6972 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6973 };
6974 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6975}
6976
6977SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6978 SelectionDAG &DAG) const {
6979 if (Subtarget->hasApertureRegs()) {
6980 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6981 ? AMDGPU::SRC_SHARED_BASE
6982 : AMDGPU::SRC_PRIVATE_BASE;
6983 // Note: this feature (register) is broken. When used as a 32-bit operand,
6984 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6985 // bits.
6986 //
6987 // To work around the issue, directly emit a 64 bit mov from this register
6988 // then extract the high bits. Note that this shouldn't even result in a
6989 // shift being emitted; it should simply become a pair of registers (e.g.):
6990 // s_mov_b64 s[6:7], src_shared_base
6991 // v_mov_b32_e32 v1, s7
6992 //
6993 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6994 // coalescing would kick in and it would think it's okay to use the "HI"
6995 // subregister directly (instead of extracting the HI 32 bits) which is an
6996 // artificial (unusable) register.
6997 // Register TableGen definitions would need an overhaul to get rid of the
6998 // artificial "HI" aperture registers and prevent this kind of issue from
6999 // happening.
7000 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7001 DAG.getRegister(ApertureRegNo, MVT::i64));
7002 return DAG.getNode(
7003 ISD::TRUNCATE, DL, MVT::i32,
7004 DAG.getNode(ISD::SRL, DL, MVT::i64,
7005 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7006 }
7007
7008 // For code object version 5, private_base and shared_base are passed through
7009 // implicit kernargs.
7010 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7014 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7015 }
7016
7019 Register UserSGPR = Info->getQueuePtrUserSGPR();
7020 if (UserSGPR == AMDGPU::NoRegister) {
7021 // We probably are in a function incorrectly marked with
7022 // amdgpu-no-queue-ptr. This is undefined.
7023 return DAG.getUNDEF(MVT::i32);
7024 }
7025
7026 SDValue QueuePtr = CreateLiveInRegister(
7027 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7028
7029 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7030 // private_segment_aperture_base_hi.
7031 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7032
7033 SDValue Ptr =
7034 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7035
7036 // TODO: Use custom target PseudoSourceValue.
7037 // TODO: We should use the value from the IR intrinsic call, but it might not
7038 // be available and how do we get it?
7040 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7041 commonAlignment(Align(64), StructOffset),
7044}
7045
7046/// Return true if the value is a known valid address, such that a null check is
7047/// not necessary.
7048static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7049 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7050 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7051 isa<BasicBlockSDNode>(Val))
7052 return true;
7053
7054 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7055 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7056
7057 // TODO: Search through arithmetic, handle arguments and loads
7058 // marked nonnull.
7059 return false;
7060}
7061
7062SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7063 SelectionDAG &DAG) const {
7064 SDLoc SL(Op);
7065
7066 const AMDGPUTargetMachine &TM =
7067 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7068
7069 unsigned DestAS, SrcAS;
7070 SDValue Src;
7071 bool IsNonNull = false;
7072 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7073 SrcAS = ASC->getSrcAddressSpace();
7074 Src = ASC->getOperand(0);
7075 DestAS = ASC->getDestAddressSpace();
7076 } else {
7077 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7078 Op.getConstantOperandVal(0) ==
7079 Intrinsic::amdgcn_addrspacecast_nonnull);
7080 Src = Op->getOperand(1);
7081 SrcAS = Op->getConstantOperandVal(2);
7082 DestAS = Op->getConstantOperandVal(3);
7083 IsNonNull = true;
7084 }
7085
7086 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7087
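  // Illustrative sketch (not verbatim from the generated DAG) of the two
  // conversions handled below, writing segment-null for
  // TM.getNullPointerValue(AS) and flat-null for the 64-bit 0:
  //   flat -> local/private:
  //     p32 = trunc p64 to i32
  //     res = (p64 != flat-null) ? p32 : segment-null
  //   local/private -> flat:
  //     p64 = bitcast (build_vector p32, aperture-hi) to i64
  //     res = (p32 != segment-null) ? p64 : flat-null
  // When the source is known non-null, the select is skipped entirely.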
7088 // flat -> local/private
7089 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7090 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7091 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7092 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7093
7094 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7095 return Ptr;
7096
7097 unsigned NullVal = TM.getNullPointerValue(DestAS);
7098 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7099 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7100
7101 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7102 SegmentNullPtr);
7103 }
7104 }
7105
7106 // local/private -> flat
7107 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7108 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7109 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7110
7111 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7112 SDValue CvtPtr =
7113 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7114 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7115
7116 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7117 return CvtPtr;
7118
7119 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7120 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7121
7122 SDValue NonNull
7123 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7124
7125 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7126 FlatNullPtr);
7127 }
7128 }
7129
7130 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7131 Op.getValueType() == MVT::i64) {
7134 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7135 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7136 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7137 }
7138
7139 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7140 Src.getValueType() == MVT::i64)
7141 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7142
7143 // global <-> flat are no-ops and never emitted.
7144
7145 const MachineFunction &MF = DAG.getMachineFunction();
7146 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7147 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7148 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7149
7150 return DAG.getUNDEF(Op->getValueType(0));
7151}
7152
7153// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7154// the small vector and inserting them into the big vector. That is better than
7155// the default expansion of doing it via a stack slot. Even though the use of
7156// the stack slot would be optimized away afterwards, the stack slot itself
7157// remains.
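// For example (illustrative): inserting a v2i16 subvector into a v4i16 vector
// at (even) index 2 takes the 16-bit path below: the vectors are bitcast to
// v2i32 and i32, and a single i32 INSERT_VECTOR_ELT at index 1 replaces the
// high dword, with no stack slot involved.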
7158SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7159 SelectionDAG &DAG) const {
7160 SDValue Vec = Op.getOperand(0);
7161 SDValue Ins = Op.getOperand(1);
7162 SDValue Idx = Op.getOperand(2);
7163 EVT VecVT = Vec.getValueType();
7164 EVT InsVT = Ins.getValueType();
7165 EVT EltVT = VecVT.getVectorElementType();
7166 unsigned InsNumElts = InsVT.getVectorNumElements();
7167 unsigned IdxVal = Idx->getAsZExtVal();
7168 SDLoc SL(Op);
7169
7170 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7171 // Insert 32-bit registers at a time.
7172 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7173
7174 unsigned VecNumElts = VecVT.getVectorNumElements();
7175 EVT NewVecVT =
7176 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7177 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7179 MVT::i32, InsNumElts / 2);
7180
7181 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7182 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7183
7184 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7185 SDValue Elt;
7186 if (InsNumElts == 2) {
7187 Elt = Ins;
7188 } else {
7189 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7190 DAG.getConstant(I, SL, MVT::i32));
7191 }
7192 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7193 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7194 }
7195
7196 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7197 }
7198
7199 for (unsigned I = 0; I != InsNumElts; ++I) {
7200 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7201 DAG.getConstant(I, SL, MVT::i32));
7202 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7203 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7204 }
7205 return Vec;
7206}
7207
7208SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7209 SelectionDAG &DAG) const {
7210 SDValue Vec = Op.getOperand(0);
7211 SDValue InsVal = Op.getOperand(1);
7212 SDValue Idx = Op.getOperand(2);
7213 EVT VecVT = Vec.getValueType();
7214 EVT EltVT = VecVT.getVectorElementType();
7215 unsigned VecSize = VecVT.getSizeInBits();
7216 unsigned EltSize = EltVT.getSizeInBits();
7217 SDLoc SL(Op);
7218
7219 // Specially handle the case of v4i16 with static indexing.
7220 unsigned NumElts = VecVT.getVectorNumElements();
7221 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7222 if (NumElts == 4 && EltSize == 16 && KIdx) {
7223 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7224
7225 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7226 DAG.getConstant(0, SL, MVT::i32));
7227 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7228 DAG.getConstant(1, SL, MVT::i32));
7229
7230 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7231 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7232
7233 unsigned Idx = KIdx->getZExtValue();
7234 bool InsertLo = Idx < 2;
7235 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7236 InsertLo ? LoVec : HiVec,
7237 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7238 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7239
7240 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7241
7242 SDValue Concat = InsertLo ?
7243 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7244 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7245
7246 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7247 }
7248
7249 // Static indexing does not lower to stack access, and hence there is no need
7250 // for special custom lowering to avoid stack access.
7251 if (isa<ConstantSDNode>(Idx))
7252 return SDValue();
7253
7254 // Avoid stack access for dynamic indexing by custom lowering to
7255 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
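  // Worked example (sketch) for v4i16 with a dynamic index Idx: EltSize is 16,
  // so ScaledIdx = Idx << 4 and BFM = 0xffff << ScaledIdx in the 64-bit
  // integer type. The inserted value is splatted and ANDed with BFM, the
  // original vector is ANDed with ~BFM, and the two results are ORed together.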
7256
7257 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7258
7259 MVT IntVT = MVT::getIntegerVT(VecSize);
7260
7261 // Convert vector index to bit-index and get the required bit mask.
7262 assert(isPowerOf2_32(EltSize));
7263 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7264 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7265 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7266 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7267 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7268
7269 // 1. Create a congruent vector with the target value in each element.
7270 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7271 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7272
7273 // 2. Mask off all other indices except the required index within (1).
7274 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7275
7276 // 3. Mask off the required index within the target vector.
7277 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7278 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7279 DAG.getNOT(SL, BFM, IntVT), BCVec);
7280
7281 // 4. Get (2) and (3) ORed into the target vector.
7282 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7283
7284 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7285}
7286
7287SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7288 SelectionDAG &DAG) const {
7289 SDLoc SL(Op);
7290
7291 EVT ResultVT = Op.getValueType();
7292 SDValue Vec = Op.getOperand(0);
7293 SDValue Idx = Op.getOperand(1);
7294 EVT VecVT = Vec.getValueType();
7295 unsigned VecSize = VecVT.getSizeInBits();
7296 EVT EltVT = VecVT.getVectorElementType();
7297
7298 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7299
7300 // Make sure we do any optimizations that will make it easier to fold
7301 // source modifiers before obscuring it with bit operations.
7302
7303 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7304 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7305 return Combined;
7306
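  // Illustrative sketch of the wide-vector path below: a dynamic extract from
  // a 256-bit vector is done by bitcasting to v4i64, rebuilding two 128-bit
  // halves, selecting Hi when Idx > (NumElts / 2 - 1) and Lo otherwise, and
  // extracting from the chosen half with the index masked into its range.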
7307 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7308 SDValue Lo, Hi;
7309 EVT LoVT, HiVT;
7310 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7311
7312 if (VecSize == 128) {
7313 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7314 Lo = DAG.getBitcast(LoVT,
7315 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7316 DAG.getConstant(0, SL, MVT::i32)));
7317 Hi = DAG.getBitcast(HiVT,
7318 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7319 DAG.getConstant(1, SL, MVT::i32)));
7320 } else if (VecSize == 256) {
7321 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7322 SDValue Parts[4];
7323 for (unsigned P = 0; P < 4; ++P) {
7324 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7325 DAG.getConstant(P, SL, MVT::i32));
7326 }
7327
7328 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7329 Parts[0], Parts[1]));
7330 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7331 Parts[2], Parts[3]));
7332 } else {
7333 assert(VecSize == 512);
7334
7335 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7336 SDValue Parts[8];
7337 for (unsigned P = 0; P < 8; ++P) {
7338 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7339 DAG.getConstant(P, SL, MVT::i32));
7340 }
7341
7342 Lo = DAG.getBitcast(LoVT,
7343 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7344 Parts[0], Parts[1], Parts[2], Parts[3]));
7345 Hi = DAG.getBitcast(HiVT,
7346 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7347 Parts[4], Parts[5], Parts[6], Parts[7]));
7348 }
7349
7350 EVT IdxVT = Idx.getValueType();
7351 unsigned NElem = VecVT.getVectorNumElements();
7352 assert(isPowerOf2_32(NElem));
7353 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7354 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7355 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7356 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7357 }
7358
7359 assert(VecSize <= 64);
7360
7361 MVT IntVT = MVT::getIntegerVT(VecSize);
7362
7363 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7364 SDValue VecBC = peekThroughBitcasts(Vec);
7365 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7366 SDValue Src = VecBC.getOperand(0);
7367 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7368 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7369 }
7370
7371 unsigned EltSize = EltVT.getSizeInBits();
7372 assert(isPowerOf2_32(EltSize));
7373
7374 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7375
7376 // Convert vector index to bit-index (* EltSize)
7377 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7378
7379 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7380 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7381
7382 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7383 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7384 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7385 }
7386
7387 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7388}
7389
7390static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7391 assert(Elt % 2 == 0);
7392 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7393}
7394
7395SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7396 SelectionDAG &DAG) const {
7397 SDLoc SL(Op);
7398 EVT ResultVT = Op.getValueType();
7399 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7400
7401 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7402 EVT EltVT = PackVT.getVectorElementType();
7403 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7404
7405 // vector_shuffle <0,1,6,7> lhs, rhs
7406 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7407 //
7408 // vector_shuffle <6,7,2,3> lhs, rhs
7409 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7410 //
7411 // vector_shuffle <6,7,0,1> lhs, rhs
7412 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7413
7414 // Avoid scalarizing when both halves are reading from consecutive elements.
7416 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7417 if (elementPairIsContiguous(SVN->getMask(), I)) {
7418 const int Idx = SVN->getMaskElt(I);
7419 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7420 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7421 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7422 PackVT, SVN->getOperand(VecIdx),
7423 DAG.getConstant(EltIdx, SL, MVT::i32));
7424 Pieces.push_back(SubVec);
7425 } else {
7426 const int Idx0 = SVN->getMaskElt(I);
7427 const int Idx1 = SVN->getMaskElt(I + 1);
7428 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7429 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7430 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7431 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7432
7433 SDValue Vec0 = SVN->getOperand(VecIdx0);
7434 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7435 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7436
7437 SDValue Vec1 = SVN->getOperand(VecIdx1);
7438 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7439 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7440 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7441 }
7442 }
7443
7444 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7445}
7446
7447SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7448 SelectionDAG &DAG) const {
7449 SDValue SVal = Op.getOperand(0);
7450 EVT ResultVT = Op.getValueType();
7451 EVT SValVT = SVal.getValueType();
7452 SDValue UndefVal = DAG.getUNDEF(SValVT);
7453 SDLoc SL(Op);
7454
7456 VElts.push_back(SVal);
7457 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7458 VElts.push_back(UndefVal);
7459
7460 return DAG.getBuildVector(ResultVT, SL, VElts);
7461}
7462
7463SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7464 SelectionDAG &DAG) const {
7465 SDLoc SL(Op);
7466 EVT VT = Op.getValueType();
7467
7468 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7469 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7471 VT.getVectorNumElements() / 2);
7472 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7473
7474 // Turn into pair of packed build_vectors.
7475 // TODO: Special case for constants that can be materialized with s_mov_b64.
7476 SmallVector<SDValue, 4> LoOps, HiOps;
7477 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7478 LoOps.push_back(Op.getOperand(I));
7479 HiOps.push_back(Op.getOperand(I + E));
7480 }
7481 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7482 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7483
7484 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7485 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7486
7487 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7488 { CastLo, CastHi });
7489 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7490 }
7491
7492 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7494 VT.getVectorNumElements() / 4);
7495 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7496
7497 SmallVector<SDValue, 4> Parts[4];
7498 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7499 for (unsigned P = 0; P < 4; ++P)
7500 Parts[P].push_back(Op.getOperand(I + P * E));
7501 }
7502 SDValue Casts[4];
7503 for (unsigned P = 0; P < 4; ++P) {
7504 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7505 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7506 }
7507
7508 SDValue Blend =
7509 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7510 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7511 }
7512
7513 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7515 VT.getVectorNumElements() / 8);
7516 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7517
7518 SmallVector<SDValue, 8> Parts[8];
7519 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7520 for (unsigned P = 0; P < 8; ++P)
7521 Parts[P].push_back(Op.getOperand(I + P * E));
7522 }
7523 SDValue Casts[8];
7524 for (unsigned P = 0; P < 8; ++P) {
7525 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7526 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7527 }
7528
7529 SDValue Blend =
7530 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7531 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7532 }
7533
7534 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7535 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7536
7537 SDValue Lo = Op.getOperand(0);
7538 SDValue Hi = Op.getOperand(1);
7539
7540 // Avoid adding defined bits with the zero_extend.
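  // Sketch of the packing below: the result is bitcast from
  // zext(lo) | (zext(hi) << 16); if one half is undef, the corresponding
  // extend/or is dropped (an undef hi leaves just an any_extend of lo), so no
  // extra defined bits are introduced.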
7541 if (Hi.isUndef()) {
7542 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7543 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7544 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7545 }
7546
7547 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7548 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7549
7550 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7551 DAG.getConstant(16, SL, MVT::i32));
7552 if (Lo.isUndef())
7553 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7554
7555 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7556 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7557
7558 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7559 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7560}
7561
7562bool
7564 // OSes that use ELF REL relocations (instead of RELA) can only store a
7565 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7566 // which can create arbitrary 64-bit addends. (This is only a problem for
7567 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7568 // the high 32 bits of the addend.)
7569 //
7570 // This should be kept in sync with how HasRelocationAddend is initialized in
7571 // the constructor of ELFAMDGPUAsmBackend.
7572 if (!Subtarget->isAmdHsaOS())
7573 return false;
7574
7575 // We can fold offsets for anything that doesn't require a GOT relocation.
7576 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7580}
7581
7582static SDValue
7583buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7584 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7585 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7586 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7587 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7588 // lowered to the following code sequence:
7589 //
7590 // For constant address space:
7591 // s_getpc_b64 s[0:1]
7592 // s_add_u32 s0, s0, $symbol
7593 // s_addc_u32 s1, s1, 0
7594 //
7595 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7596 // a fixup or relocation is emitted to replace $symbol with a literal
7597 // constant, which is a pc-relative offset from the encoding of the $symbol
7598 // operand to the global variable.
7599 //
7600 // For global address space:
7601 // s_getpc_b64 s[0:1]
7602 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7603 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7604 //
7605 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7606 // fixups or relocations are emitted to replace $symbol@*@lo and
7607 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7608 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7609 // operand to the global variable.
7610 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7611 SDValue PtrHi;
7612 if (GAFlags == SIInstrInfo::MO_NONE)
7613 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7614 else
7615 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7616 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7617}
7618
7619SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7620 SDValue Op,
7621 SelectionDAG &DAG) const {
7622 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7623 SDLoc DL(GSD);
7624 EVT PtrVT = Op.getValueType();
7625
7626 const GlobalValue *GV = GSD->getGlobal();
7632 GV->hasExternalLinkage()) {
7633 Type *Ty = GV->getValueType();
7634 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7635 // zero-sized type in other languages to declare dynamic shared
7636 // memory whose size is not known at compile time. It will be
7637 // allocated by the runtime and placed directly after the statically
7638 // allocated ones. They all share the same offset.
7639 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7640 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7641 // Adjust alignment for that dynamic shared memory array.
7643 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7644 MFI->setUsesDynamicLDS(true);
7645 return SDValue(
7646 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7647 }
7648 }
7650 }
7651
7653 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7655 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7656 }
7657
7658 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7659 SDValue AddrLo = DAG.getTargetGlobalAddress(
7660 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7661 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7662
7663 SDValue AddrHi = DAG.getTargetGlobalAddress(
7664 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7665 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7666
7667 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7668 }
7669
7670 if (shouldEmitFixup(GV))
7671 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7672
7673 if (shouldEmitPCReloc(GV))
7674 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7676
7677 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7679
7680 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7682 const DataLayout &DataLayout = DAG.getDataLayout();
7683 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7684 MachinePointerInfo PtrInfo
7686
7687 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7690}
7691
7693 const SDLoc &DL, SDValue V) const {
7694 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7695 // the destination register.
7696 //
7697 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7698 // so we will end up with redundant moves to m0.
7699 //
7700 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7701
7702 // A Null SDValue creates a glue result.
7703 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7704 V, Chain);
7705 return SDValue(M0, 0);
7706}
7707
7708SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7709 SDValue Op,
7710 MVT VT,
7711 unsigned Offset) const {
7712 SDLoc SL(Op);
7713 SDValue Param = lowerKernargMemParameter(
7714 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7715 // The local size values will have the hi 16-bits as zero.
7716 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7717 DAG.getValueType(VT));
7718}
7719
7721 EVT VT) {
7723 "non-hsa intrinsic with hsa target",
7724 DL.getDebugLoc());
7725 DAG.getContext()->diagnose(BadIntrin);
7726 return DAG.getUNDEF(VT);
7727}
7728
7730 EVT VT) {
7732 "intrinsic not supported on subtarget",
7733 DL.getDebugLoc());
7734 DAG.getContext()->diagnose(BadIntrin);
7735 return DAG.getUNDEF(VT);
7736}
7737
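// Build a single f32 vector operand from a list of dword-sized values,
// bitcasting each element to f32 as needed. Lists of up to 12 elements keep
// their exact element count; longer lists (up to 16) are padded with undef to
// v16f32, and a single element is returned as a scalar. This is how the image
// address operands collected in lowerImage are combined when a non-NSA (or
// partial-NSA) encoding is used.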
7738static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
7739 ArrayRef<SDValue> Elts) {
7740 assert(!Elts.empty());
7741 MVT Type;
7742 unsigned NumElts = Elts.size();
7743
7744 if (NumElts <= 12) {
7745 Type = MVT::getVectorVT(MVT::f32, NumElts);
7746 } else {
7747 assert(Elts.size() <= 16);
7748 Type = MVT::v16f32;
7749 NumElts = 16;
7750 }
7751
7752 SmallVector<SDValue, 16> VecElts(NumElts);
7753 for (unsigned i = 0; i < Elts.size(); ++i) {
7754 SDValue Elt = Elts[i];
7755 if (Elt.getValueType() != MVT::f32)
7756 Elt = DAG.getBitcast(MVT::f32, Elt);
7757 VecElts[i] = Elt;
7758 }
7759 for (unsigned i = Elts.size(); i < NumElts; ++i)
7760 VecElts[i] = DAG.getUNDEF(MVT::f32);
7761
7762 if (NumElts == 1)
7763 return VecElts[0];
7764 return DAG.getBuildVector(Type, DL, VecElts);
7765}
7766
7767static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7768 SDValue Src, int ExtraElts) {
7769 EVT SrcVT = Src.getValueType();
7770
7772
7773 if (SrcVT.isVector())
7774 DAG.ExtractVectorElements(Src, Elts);
7775 else
7776 Elts.push_back(Src);
7777
7778 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7779 while (ExtraElts--)
7780 Elts.push_back(Undef);
7781
7782 return DAG.getBuildVector(CastVT, DL, Elts);
7783}
7784
7785// Reconstruct the required return value for an image load intrinsic.
7786// This is more complicated due to the optional use of TexFailCtrl, which means
7787// the required return type is an aggregate.
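// For example (illustrative): a d16 image load returning {v4f16, i32, chain}
// with TFE enabled is selected as a machine node producing three dwords; the
// first two hold the packed half data and are converted back to v4f16, the
// third is returned as the i32 TexFail value, and the chain is passed through.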
7788static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7789 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7790 bool Unpacked, bool IsD16, int DMaskPop,
7791 int NumVDataDwords, bool IsAtomicPacked16Bit,
7792 const SDLoc &DL) {
7793 // Determine the required return type, which is the same regardless of the IsTexFail flag.
7794 EVT ReqRetVT = ResultTypes[0];
7795 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7796 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7797 ? (ReqRetNumElts + 1) / 2
7798 : ReqRetNumElts;
7799
7800 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7801
7802 MVT DataDwordVT = NumDataDwords == 1 ?
7803 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7804
7805 MVT MaskPopVT = MaskPopDwords == 1 ?
7806 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7807
7808 SDValue Data(Result, 0);
7809 SDValue TexFail;
7810
7811 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7812 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7813 if (MaskPopVT.isVector()) {
7814 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7815 SDValue(Result, 0), ZeroIdx);
7816 } else {
7817 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7818 SDValue(Result, 0), ZeroIdx);
7819 }
7820 }
7821
7822 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7823 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7824 NumDataDwords - MaskPopDwords);
7825
7826 if (IsD16)
7827 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7828
7829 EVT LegalReqRetVT = ReqRetVT;
7830 if (!ReqRetVT.isVector()) {
7831 if (!Data.getValueType().isInteger())
7832 Data = DAG.getNode(ISD::BITCAST, DL,
7833 Data.getValueType().changeTypeToInteger(), Data);
7834 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7835 } else {
7836 // We need to widen the return vector to a legal type
7837 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7838 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7839 LegalReqRetVT =
7841 ReqRetVT.getVectorNumElements() + 1);
7842 }
7843 }
7844 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7845
7846 if (IsTexFail) {
7847 TexFail =
7848 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7849 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7850
7851 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7852 }
7853
7854 if (Result->getNumValues() == 1)
7855 return Data;
7856
7857 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7858}
7859
7860static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7861 SDValue *LWE, bool &IsTexFail) {
7862 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7863
7864 uint64_t Value = TexFailCtrlConst->getZExtValue();
7865 if (Value) {
7866 IsTexFail = true;
7867 }
7868
7869 SDLoc DL(TexFailCtrlConst);
7870 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7871 Value &= ~(uint64_t)0x1;
7872 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7873 Value &= ~(uint64_t)0x2;
7874
7875 return Value == 0;
7876}
7877
7878static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7879 MVT PackVectorVT,
7880 SmallVectorImpl<SDValue> &PackedAddrs,
7881 unsigned DimIdx, unsigned EndIdx,
7882 unsigned NumGradients) {
7883 SDLoc DL(Op);
7884 for (unsigned I = DimIdx; I < EndIdx; I++) {
7885 SDValue Addr = Op.getOperand(I);
7886
7887 // Gradients are packed with undef for each coordinate.
7888 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7889 // 1D: undef,dx/dh; undef,dx/dv
7890 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7891 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7892 if (((I + 1) >= EndIdx) ||
7893 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7894 I == DimIdx + NumGradients - 1))) {
7895 if (Addr.getValueType() != MVT::i16)
7896 Addr = DAG.getBitcast(MVT::i16, Addr);
7897 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7898 } else {
7899 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7900 I++;
7901 }
7902 Addr = DAG.getBitcast(MVT::f32, Addr);
7903 PackedAddrs.push_back(Addr);
7904 }
7905}
7906
7907SDValue SITargetLowering::lowerImage(SDValue Op,
7909 SelectionDAG &DAG, bool WithChain) const {
7910 SDLoc DL(Op);
7912 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7913 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7915 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7916 unsigned IntrOpcode = Intr->BaseOpcode;
7917 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7918 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7919 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7920
7921 SmallVector<EVT, 3> ResultTypes(Op->values());
7922 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7923 bool IsD16 = false;
7924 bool IsG16 = false;
7925 bool IsA16 = false;
7926 SDValue VData;
7927 int NumVDataDwords = 0;
7928 bool AdjustRetType = false;
7929 bool IsAtomicPacked16Bit = false;
7930
7931 // Offset of intrinsic arguments
7932 const unsigned ArgOffset = WithChain ? 2 : 1;
7933
7934 unsigned DMask;
7935 unsigned DMaskLanes = 0;
7936
7937 if (BaseOpcode->Atomic) {
7938 VData = Op.getOperand(2);
7939
7940 IsAtomicPacked16Bit =
7941 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7942 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7943
7944 bool Is64Bit = VData.getValueSizeInBits() == 64;
7945 if (BaseOpcode->AtomicX2) {
7946 SDValue VData2 = Op.getOperand(3);
7947 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7948 {VData, VData2});
7949 if (Is64Bit)
7950 VData = DAG.getBitcast(MVT::v4i32, VData);
7951
7952 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7953 DMask = Is64Bit ? 0xf : 0x3;
7954 NumVDataDwords = Is64Bit ? 4 : 2;
7955 } else {
7956 DMask = Is64Bit ? 0x3 : 0x1;
7957 NumVDataDwords = Is64Bit ? 2 : 1;
7958 }
7959 } else {
7960 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7961 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7962
7963 if (BaseOpcode->Store) {
7964 VData = Op.getOperand(2);
7965
7966 MVT StoreVT = VData.getSimpleValueType();
7967 if (StoreVT.getScalarType() == MVT::f16) {
7968 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7969 return Op; // D16 is unsupported for this instruction
7970
7971 IsD16 = true;
7972 VData = handleD16VData(VData, DAG, true);
7973 }
7974
7975 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7976 } else if (!BaseOpcode->NoReturn) {
7977 // Work out the number of dwords based on the dmask popcount, the underlying
7978 // type, and whether packing is supported.
7979 MVT LoadVT = ResultTypes[0].getSimpleVT();
7980 if (LoadVT.getScalarType() == MVT::f16) {
7981 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7982 return Op; // D16 is unsupported for this instruction
7983
7984 IsD16 = true;
7985 }
7986
7987 // Confirm that the return type is large enough for the dmask specified
7988 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7989 (!LoadVT.isVector() && DMaskLanes > 1))
7990 return Op;
7991
7992 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
7993 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7994 // instructions.
7995 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7996 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7997 NumVDataDwords = (DMaskLanes + 1) / 2;
7998 else
7999 NumVDataDwords = DMaskLanes;
8000
8001 AdjustRetType = true;
8002 }
8003 }
8004
8005 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8007
8008 // Check for 16 bit addresses or derivatives and pack if true.
8009 MVT VAddrVT =
8010 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
8011 MVT VAddrScalarVT = VAddrVT.getScalarType();
8012 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8013 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8014
8015 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
8016 VAddrScalarVT = VAddrVT.getScalarType();
8017 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8018 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8019
8020 // Push back extra arguments.
8021 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8022 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
8023 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8024 // Special handling of bias when A16 is on. Bias is of type half but
8025 // occupies full 32-bit.
8026 SDValue Bias = DAG.getBuildVector(
8027 MVT::v2f16, DL,
8028 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
8029 VAddrs.push_back(Bias);
8030 } else {
8031 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8032 "Bias needs to be converted to 16 bit in A16 mode");
8033 VAddrs.push_back(Op.getOperand(ArgOffset + I));
8034 }
8035 }
8036
8037 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8038 // 16-bit gradients are supported, but are tied to the A16 control,
8039 // so both gradients and addresses must be 16-bit.
8040 LLVM_DEBUG(
8041 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8042 "require 16 bit args for both gradients and addresses");
8043 return Op;
8044 }
8045
8046 if (IsA16) {
8047 if (!ST->hasA16()) {
8048 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8049 "support 16 bit addresses\n");
8050 return Op;
8051 }
8052 }
8053
8054 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8055 // is set then we have to compress/pack operands (either addresses,
8056 // gradients, or both).
8057 // In the case where A16 and gradients are tied (no G16 support), we have
8058 // already verified that both IsA16 and IsG16 are true.
8059 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8060 // Activate g16
8061 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8063 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8064 }
8065
8066 // Add gradients (packed or unpacked)
8067 if (IsG16) {
8068 // Pack the gradients
8069 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8070 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
8071 ArgOffset + Intr->GradientStart,
8072 ArgOffset + Intr->CoordStart, Intr->NumGradients);
8073 } else {
8074 for (unsigned I = ArgOffset + Intr->GradientStart;
8075 I < ArgOffset + Intr->CoordStart; I++)
8076 VAddrs.push_back(Op.getOperand(I));
8077 }
8078
8079 // Add addresses (packed or unpacked)
8080 if (IsA16) {
8081 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
8082 ArgOffset + Intr->CoordStart, VAddrEnd,
8083 0 /* No gradients */);
8084 } else {
8085 // Add uncompressed address
8086 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8087 VAddrs.push_back(Op.getOperand(I));
8088 }
8089
8090 // If the register allocator cannot place the address registers contiguously
8091 // without introducing moves, then using the non-sequential address encoding
8092 // is always preferable, since it saves VALU instructions and is usually a
8093 // wash in terms of code size or even better.
8094 //
8095 // However, we currently have no way of hinting to the register allocator that
8096 // MIMG addresses should be placed contiguously when it is possible to do so,
8097 // so force non-NSA for the common 2-address case as a heuristic.
8098 //
8099 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8100 // allocation when possible.
8101 //
8102 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8103 // set of the remaining addresses.
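  // For example (sketch): with NSAMaxSize = 5 and 7 address dwords on a target
  // with partial NSA, the first 4 addresses stay as separate operands and the
  // remaining 3 are packed into one contiguous register tuple passed as the
  // final address operand.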
8104 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
8105 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8106 const bool UseNSA = ST->hasNSAEncoding() &&
8107 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8108 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8109 const bool UsePartialNSA =
8110 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8111
8112 SDValue VAddr;
8113 if (UsePartialNSA) {
8114 VAddr = getBuildDwordsVector(DAG, DL,
8115 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
8116 }
8117 else if (!UseNSA) {
8118 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
8119 }
8120
8121 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
8122 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
8123 SDValue Unorm;
8124 if (!BaseOpcode->Sampler) {
8125 Unorm = True;
8126 } else {
8127 uint64_t UnormConst =
8128 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
8129
8130 Unorm = UnormConst ? True : False;
8131 }
8132
8133 SDValue TFE;
8134 SDValue LWE;
8135 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
8136 bool IsTexFail = false;
8137 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
8138 return Op;
8139
8140 if (IsTexFail) {
8141 if (!DMaskLanes) {
8142 // Expecting to get an error flag since TFC is on and dmask is 0.
8143 // Force dmask to be at least 1, otherwise the instruction will fail.
8144 DMask = 0x1;
8145 DMaskLanes = 1;
8146 NumVDataDwords = 1;
8147 }
8148 NumVDataDwords += 1;
8149 AdjustRetType = true;
8150 }
8151
8152 // Something earlier may have tagged the return type as needing adjustment.
8153 // This happens if the instruction is a load or has TexFailCtrl flags set.
8154 if (AdjustRetType) {
8155 // NumVDataDwords reflects the true number of dwords required in the return type
8156 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8157 // This is a no-op load. This can be eliminated
8158 SDValue Undef = DAG.getUNDEF(Op.getValueType());
8159 if (isa<MemSDNode>(Op))
8160 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
8161 return Undef;
8162 }
8163
8164 EVT NewVT = NumVDataDwords > 1 ?
8165 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
8166 : MVT::i32;
8167
8168 ResultTypes[0] = NewVT;
8169 if (ResultTypes.size() == 3) {
8170 // The original result was an aggregate type used for TexFailCtrl results.
8171 // The actual instruction returns a vector type, which has now been
8172 // created. Remove the aggregate result.
8173 ResultTypes.erase(&ResultTypes[1]);
8174 }
8175 }
8176
8177 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
8178 if (BaseOpcode->Atomic)
8179 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8180 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8182 return Op;
8183
8185 if (BaseOpcode->Store || BaseOpcode->Atomic)
8186 Ops.push_back(VData); // vdata
8187 if (UsePartialNSA) {
8188 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
8189 Ops.push_back(VAddr);
8190 }
8191 else if (UseNSA)
8192 append_range(Ops, VAddrs);
8193 else
8194 Ops.push_back(VAddr);
8195 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
8196 if (BaseOpcode->Sampler)
8197 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
8198 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
8199 if (IsGFX10Plus)
8200 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
8201 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8202 Ops.push_back(Unorm);
8203 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
8204 Ops.push_back(IsA16 && // r128, a16 for gfx9
8205 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8206 if (IsGFX10Plus)
8207 Ops.push_back(IsA16 ? True : False);
8208 if (!Subtarget->hasGFX90AInsts()) {
8209 Ops.push_back(TFE); // tfe
8210 } else if (TFE->getAsZExtVal()) {
8211 report_fatal_error("TFE is not supported on this GPU");
8212 }
8213 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8214 Ops.push_back(LWE); // lwe
8215 if (!IsGFX10Plus)
8216 Ops.push_back(DimInfo->DA ? True : False);
8217 if (BaseOpcode->HasD16)
8218 Ops.push_back(IsD16 ? True : False);
8219 if (isa<MemSDNode>(Op))
8220 Ops.push_back(Op.getOperand(0)); // chain
8221
8222 int NumVAddrDwords =
8223 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8224 int Opcode = -1;
8225
8226 if (IsGFX12Plus) {
8227 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8228 NumVDataDwords, NumVAddrDwords);
8229 } else if (IsGFX11Plus) {
8230 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8231 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8232 : AMDGPU::MIMGEncGfx11Default,
8233 NumVDataDwords, NumVAddrDwords);
8234 } else if (IsGFX10Plus) {
8235 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8236 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8237 : AMDGPU::MIMGEncGfx10Default,
8238 NumVDataDwords, NumVAddrDwords);
8239 } else {
8240 if (Subtarget->hasGFX90AInsts()) {
8241 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8242 NumVDataDwords, NumVAddrDwords);
8243 if (Opcode == -1)
8245 "requested image instruction is not supported on this GPU");
8246 }
8247 if (Opcode == -1 &&
8249 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8250 NumVDataDwords, NumVAddrDwords);
8251 if (Opcode == -1)
8252 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8253 NumVDataDwords, NumVAddrDwords);
8254 }
8255 if (Opcode == -1)
8256 return Op;
8257
8258 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8259 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8260 MachineMemOperand *MemRef = MemOp->getMemOperand();
8261 DAG.setNodeMemRefs(NewNode, {MemRef});
8262 }
8263
8264 if (BaseOpcode->AtomicX2) {
8266 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8267 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8268 }
8269 if (BaseOpcode->NoReturn)
8270 return SDValue(NewNode, 0);
8271 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8272 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8273 NumVDataDwords, IsAtomicPacked16Bit, DL);
8274}
8275
8276SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8277 SDValue Offset, SDValue CachePolicy,
8278 SelectionDAG &DAG) const {
8280
8281 const DataLayout &DataLayout = DAG.getDataLayout();
8282 Align Alignment =
8284
8289 VT.getStoreSize(), Alignment);
8290
8291 if (!Offset->isDivergent()) {
8292 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8293
8294 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8295 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8296 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8297 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8298 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8299 SDValue BufferLoad =
8301 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8302 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8303 }
8304
8305 // Widen vec3 load to vec4.
8306 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8307 !Subtarget->hasScalarDwordx3Loads()) {
8308 EVT WidenedVT =
8310 auto WidenedOp = DAG.getMemIntrinsicNode(
8311 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8312 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8313 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8314 DAG.getVectorIdxConstant(0, DL));
8315 return Subvector;
8316 }
8317
8319 DAG.getVTList(VT), Ops, VT, MMO);
8320 }
8321
8322 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8323 // assume that the buffer is unswizzled.
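  // For example (illustrative): an s.buffer.load of v8f32 with a divergent
  // offset is emitted as two v4f32 BUFFER_LOAD nodes at offsets Off and
  // Off + 16, whose results are concatenated back into a v8f32.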
8324 SDValue Ops[] = {
8325 DAG.getEntryNode(), // Chain
8326 Rsrc, // rsrc
8327 DAG.getConstant(0, DL, MVT::i32), // vindex
8328 {}, // voffset
8329 {}, // soffset
8330 {}, // offset
8331 CachePolicy, // cachepolicy
8332 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8333 };
8334 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8335 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8336 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8337 }
8338
8340 unsigned NumLoads = 1;
8341 MVT LoadVT = VT.getSimpleVT();
8342 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8343 assert((LoadVT.getScalarType() == MVT::i32 ||
8344 LoadVT.getScalarType() == MVT::f32));
8345
8346 if (NumElts == 8 || NumElts == 16) {
8347 NumLoads = NumElts / 4;
8348 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8349 }
8350
8351 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8352
8353 // Use the alignment to ensure that the required offsets will fit into the
8354 // immediate offsets.
8355 setBufferOffsets(Offset, DAG, &Ops[3],
8356 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8357
8358 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8359 for (unsigned i = 0; i < NumLoads; ++i) {
8360 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8361 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8362 LoadVT, MMO, DAG));
8363 }
8364
8365 if (NumElts == 8 || NumElts == 16)
8366 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8367
8368 return Loads[0];
8369}
8370
8371SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8372 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8373 if (!Subtarget->hasArchitectedSGPRs())
8374 return {};
8375 SDLoc SL(Op);
8376 MVT VT = MVT::i32;
8377 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8378 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8379 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8380}
8381
8382SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8383 unsigned Dim,
8384 const ArgDescriptor &Arg) const {
8385 SDLoc SL(Op);
8387 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8388 if (MaxID == 0)
8389 return DAG.getConstant(0, SL, MVT::i32);
8390
8391 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8392 SDLoc(DAG.getEntryNode()), Arg);
8393
8394 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8395 // masking operations anyway.
8396 //
8397 // TODO: We could assert the top bit is 0 for the source copy.
8398 if (Arg.isMasked())
8399 return Val;
8400
8401 // Preserve the known bits after expansion to a copy.
8403 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8404 DAG.getValueType(SmallVT));
8405}
8406
8407SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8408 SelectionDAG &DAG) const {
8410 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8411
8412 EVT VT = Op.getValueType();
8413 SDLoc DL(Op);
8414 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8415
8416 // TODO: Should this propagate fast-math-flags?
8417
8418 switch (IntrinsicID) {
8419 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8420 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8421 return emitNonHSAIntrinsicError(DAG, DL, VT);
8422 return getPreloadedValue(DAG, *MFI, VT,
8424 }
8425 case Intrinsic::amdgcn_dispatch_ptr:
8426 case Intrinsic::amdgcn_queue_ptr: {
8427 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8428 DiagnosticInfoUnsupported BadIntrin(
8429 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8430 DL.getDebugLoc());
8431 DAG.getContext()->diagnose(BadIntrin);
8432 return DAG.getUNDEF(VT);
8433 }
8434
8435 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8437 return getPreloadedValue(DAG, *MFI, VT, RegID);
8438 }
8439 case Intrinsic::amdgcn_implicitarg_ptr: {
8440 if (MFI->isEntryFunction())
8441 return getImplicitArgPtr(DAG, DL);
8442 return getPreloadedValue(DAG, *MFI, VT,
8444 }
8445 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8447 // This only makes sense to call in a kernel, so just lower to null.
8448 return DAG.getConstant(0, DL, VT);
8449 }
8450
8451 return getPreloadedValue(DAG, *MFI, VT,
8453 }
8454 case Intrinsic::amdgcn_dispatch_id: {
8455 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8456 }
8457 case Intrinsic::amdgcn_rcp:
8458 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8459 case Intrinsic::amdgcn_rsq:
8460 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8461 case Intrinsic::amdgcn_rsq_legacy:
8463 return emitRemovedIntrinsicError(DAG, DL, VT);
8464 return SDValue();
8465 case Intrinsic::amdgcn_rcp_legacy:
8467 return emitRemovedIntrinsicError(DAG, DL, VT);
8468 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8469 case Intrinsic::amdgcn_rsq_clamp: {
8471 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8472
8473 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8476
8477 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8478 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8479 DAG.getConstantFP(Max, DL, VT));
8480 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8481 DAG.getConstantFP(Min, DL, VT));
8482 }
8483 case Intrinsic::r600_read_ngroups_x:
8484 if (Subtarget->isAmdHsaOS())
8485 return emitNonHSAIntrinsicError(DAG, DL, VT);
8486
8487 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8489 false);
8490 case Intrinsic::r600_read_ngroups_y:
8491 if (Subtarget->isAmdHsaOS())
8492 return emitNonHSAIntrinsicError(DAG, DL, VT);
8493
8494 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8496 false);
8497 case Intrinsic::r600_read_ngroups_z:
8498 if (Subtarget->isAmdHsaOS())
8499 return emitNonHSAIntrinsicError(DAG, DL, VT);
8500
8501 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8503 false);
8504 case Intrinsic::r600_read_global_size_x:
8505 if (Subtarget->isAmdHsaOS())
8506 return emitNonHSAIntrinsicError(DAG, DL, VT);
8507
8508 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8510 Align(4), false);
8511 case Intrinsic::r600_read_global_size_y:
8512 if (Subtarget->isAmdHsaOS())
8513 return emitNonHSAIntrinsicError(DAG, DL, VT);
8514
8515 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8517 Align(4), false);
8518 case Intrinsic::r600_read_global_size_z:
8519 if (Subtarget->isAmdHsaOS())
8520 return emitNonHSAIntrinsicError(DAG, DL, VT);
8521
8522 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8524 Align(4), false);
8525 case Intrinsic::r600_read_local_size_x:
8526 if (Subtarget->isAmdHsaOS())
8527 return emitNonHSAIntrinsicError(DAG, DL, VT);
8528
8529 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8531 case Intrinsic::r600_read_local_size_y:
8532 if (Subtarget->isAmdHsaOS())
8533 return emitNonHSAIntrinsicError(DAG, DL, VT);
8534
8535 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8537 case Intrinsic::r600_read_local_size_z:
8538 if (Subtarget->isAmdHsaOS())
8539 return emitNonHSAIntrinsicError(DAG, DL, VT);
8540
8541 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8543 case Intrinsic::amdgcn_workgroup_id_x:
8544 return getPreloadedValue(DAG, *MFI, VT,
8546 case Intrinsic::amdgcn_workgroup_id_y:
8547 return getPreloadedValue(DAG, *MFI, VT,
8549 case Intrinsic::amdgcn_workgroup_id_z:
8550 return getPreloadedValue(DAG, *MFI, VT,
8552 case Intrinsic::amdgcn_wave_id:
8553 return lowerWaveID(DAG, Op);
8554 case Intrinsic::amdgcn_lds_kernel_id: {
8555 if (MFI->isEntryFunction())
8556 return getLDSKernelId(DAG, DL);
8557 return getPreloadedValue(DAG, *MFI, VT,
8559 }
8560 case Intrinsic::amdgcn_workitem_id_x:
8561 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8562 case Intrinsic::amdgcn_workitem_id_y:
8563 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8564 case Intrinsic::amdgcn_workitem_id_z:
8565 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8566 case Intrinsic::amdgcn_wavefrontsize:
8568 SDLoc(Op), MVT::i32);
8569 case Intrinsic::amdgcn_s_buffer_load: {
8570 unsigned CPol = Op.getConstantOperandVal(3);
8571 // s_buffer_load, because of how it's optimized, can't be volatile
8572 // so reject ones with the volatile bit set.
8573 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8576 return Op;
8577 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8578 DAG);
8579 }
8580 case Intrinsic::amdgcn_fdiv_fast:
8581 return lowerFDIV_FAST(Op, DAG);
8582 case Intrinsic::amdgcn_sin:
8583 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8584
8585 case Intrinsic::amdgcn_cos:
8586 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8587
8588 case Intrinsic::amdgcn_mul_u24:
8589 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8590 case Intrinsic::amdgcn_mul_i24:
8591 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8592
8593 case Intrinsic::amdgcn_log_clamp: {
8595 return SDValue();
8596
8597 return emitRemovedIntrinsicError(DAG, DL, VT);
8598 }
8599 case Intrinsic::amdgcn_fract:
8600 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8601
8602 case Intrinsic::amdgcn_class:
8603 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8604 Op.getOperand(1), Op.getOperand(2));
8605 case Intrinsic::amdgcn_div_fmas:
8606 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8607 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8608 Op.getOperand(4));
8609
8610 case Intrinsic::amdgcn_div_fixup:
8611 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8612 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8613
8614 case Intrinsic::amdgcn_div_scale: {
8615 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8616
8617 // Translate to the operands expected by the machine instruction. The
8618 // first parameter must be the same as the first instruction.
8619 SDValue Numerator = Op.getOperand(1);
8620 SDValue Denominator = Op.getOperand(2);
8621
8622 // Note this order is opposite of the machine instruction's operations,
8623 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8624 // intrinsic has the numerator as the first operand to match a normal
8625 // division operation.
8626
8627 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8628
8629 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8630 Denominator, Numerator);
8631 }
8632 case Intrinsic::amdgcn_icmp: {
8633 // There is a Pat that handles this variant, so return it as-is.
8634 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8635 Op.getConstantOperandVal(2) == 0 &&
8636 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8637 return Op;
8638 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8639 }
8640 case Intrinsic::amdgcn_fcmp: {
8641 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8642 }
8643 case Intrinsic::amdgcn_ballot:
8644 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8645 case Intrinsic::amdgcn_fmed3:
8646 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8647 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8648 case Intrinsic::amdgcn_fdot2:
8649 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8650 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8651 Op.getOperand(4));
8652 case Intrinsic::amdgcn_fmul_legacy:
8653 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8654 Op.getOperand(1), Op.getOperand(2));
8655 case Intrinsic::amdgcn_sffbh:
8656 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8657 case Intrinsic::amdgcn_sbfe:
8658 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8659 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8660 case Intrinsic::amdgcn_ubfe:
8661 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8662 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8663 case Intrinsic::amdgcn_cvt_pkrtz:
8664 case Intrinsic::amdgcn_cvt_pknorm_i16:
8665 case Intrinsic::amdgcn_cvt_pknorm_u16:
8666 case Intrinsic::amdgcn_cvt_pk_i16:
8667 case Intrinsic::amdgcn_cvt_pk_u16: {
8668 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8669 EVT VT = Op.getValueType();
8670 unsigned Opcode;
8671
8672 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8673 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8674 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8675 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8676 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8677 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8678 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8679 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8680 else
8681 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8682
8683 if (isTypeLegal(VT))
8684 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8685
8686 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8687 Op.getOperand(1), Op.getOperand(2));
8688 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8689 }
8690 case Intrinsic::amdgcn_fmad_ftz:
8691 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8692 Op.getOperand(2), Op.getOperand(3));
8693
8694 case Intrinsic::amdgcn_if_break:
8695 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8696 Op->getOperand(1), Op->getOperand(2)), 0);
8697
8698 case Intrinsic::amdgcn_groupstaticsize: {
8699 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8700 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8701 return Op;
8702
8703 const Module *M = MF.getFunction().getParent();
8704 const GlobalValue *GV =
8705 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8706 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8707 SIInstrInfo::MO_ABS32_LO);
8708 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8709 }
8710 case Intrinsic::amdgcn_is_shared:
8711 case Intrinsic::amdgcn_is_private: {
8712 SDLoc SL(Op);
8713 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8714 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8715 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8716 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8717 Op.getOperand(1));
8718
8719 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8720 DAG.getConstant(1, SL, MVT::i32));
8721 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8722 }
8723 case Intrinsic::amdgcn_perm:
8724 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8725 Op.getOperand(2), Op.getOperand(3));
8726 case Intrinsic::amdgcn_reloc_constant: {
8727 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8728 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8729 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8730 auto RelocSymbol = cast<GlobalVariable>(
8731 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8732 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8733 SIInstrInfo::MO_ABS32_LO);
8734 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8735 }
8736 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8737 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8738 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8739 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8740 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8741 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8742 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8743 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8744 if (Op.getOperand(4).getValueType() == MVT::i32)
8745 return SDValue();
8746
8747 SDLoc SL(Op);
8748 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8749 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8750 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8751 Op.getOperand(3), IndexKeyi32);
8752 }
8753 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8754 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8755 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8756 if (Op.getOperand(6).getValueType() == MVT::i32)
8757 return SDValue();
8758
8759 SDLoc SL(Op);
8760 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8761 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8762 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8763 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8764 IndexKeyi32, Op.getOperand(7)});
8765 }
8766 case Intrinsic::amdgcn_addrspacecast_nonnull:
8767 return lowerADDRSPACECAST(Op, DAG);
8768 case Intrinsic::amdgcn_readlane:
8769 case Intrinsic::amdgcn_readfirstlane:
8770 case Intrinsic::amdgcn_writelane:
8771 case Intrinsic::amdgcn_permlane16:
8772 case Intrinsic::amdgcn_permlanex16:
8773 case Intrinsic::amdgcn_permlane64:
8774 return lowerLaneOp(*this, Op.getNode(), DAG);
8775 default:
8776 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8777 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8778 return lowerImage(Op, ImageDimIntr, DAG, false);
8779
8780 return Op;
8781 }
8782}
8783
8784// On targets not supporting constant in soffset field, turn zero to
8785// SGPR_NULL to avoid generating an extra s_mov with zero.
8786 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8787 const GCNSubtarget *Subtarget) {
8788 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8789 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8790 return SOffset;
8791}
8792
8793SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8794 SelectionDAG &DAG,
8795 unsigned NewOpcode) const {
8796 SDLoc DL(Op);
8797
8798 SDValue VData = Op.getOperand(2);
8799 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8800 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8801 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8802 SDValue Ops[] = {
8803 Op.getOperand(0), // Chain
8804 VData, // vdata
8805 Rsrc, // rsrc
8806 DAG.getConstant(0, DL, MVT::i32), // vindex
8807 Offsets.first, // voffset
8808 SOffset, // soffset
8809 Offsets.second, // offset
8810 Op.getOperand(6), // cachepolicy
8811 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8812 };
8813
8814 auto *M = cast<MemSDNode>(Op);
8815
8816 EVT MemVT = VData.getValueType();
8817 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8818 M->getMemOperand());
8819}
8820
8821SDValue
8822SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8823 unsigned NewOpcode) const {
8824 SDLoc DL(Op);
8825
8826 SDValue VData = Op.getOperand(2);
8827 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8828 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8829 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8830 SDValue Ops[] = {
8831 Op.getOperand(0), // Chain
8832 VData, // vdata
8833 Rsrc, // rsrc
8834 Op.getOperand(4), // vindex
8835 Offsets.first, // voffset
8836 SOffset, // soffset
8837 Offsets.second, // offset
8838 Op.getOperand(7), // cachepolicy
8839 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8840 };
8841
8842 auto *M = cast<MemSDNode>(Op);
8843
8844 EVT MemVT = VData.getValueType();
8845 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8846 M->getMemOperand());
8847}
8848
8849SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8850 SelectionDAG &DAG) const {
8851 unsigned IntrID = Op.getConstantOperandVal(1);
8852 SDLoc DL(Op);
8853
8854 switch (IntrID) {
8855 case Intrinsic::amdgcn_ds_ordered_add:
8856 case Intrinsic::amdgcn_ds_ordered_swap: {
8857 MemSDNode *M = cast<MemSDNode>(Op);
8858 SDValue Chain = M->getOperand(0);
8859 SDValue M0 = M->getOperand(2);
8860 SDValue Value = M->getOperand(3);
8861 unsigned IndexOperand = M->getConstantOperandVal(7);
8862 unsigned WaveRelease = M->getConstantOperandVal(8);
8863 unsigned WaveDone = M->getConstantOperandVal(9);
8864
8865 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8866 IndexOperand &= ~0x3f;
8867 unsigned CountDw = 0;
8868
8869 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8870 CountDw = (IndexOperand >> 24) & 0xf;
8871 IndexOperand &= ~(0xf << 24);
8872
8873 if (CountDw < 1 || CountDw > 4) {
8874 report_fatal_error(
8875 "ds_ordered_count: dword count must be between 1 and 4");
8876 }
8877 }
8878
8879 if (IndexOperand)
8880 report_fatal_error("ds_ordered_count: bad index operand");
8881
8882 if (WaveDone && !WaveRelease)
8883 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8884
8885 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8886 unsigned ShaderType =
8887 SIInstrInfo::getDSShaderTypeValue(MF);
8888 unsigned Offset0 = OrderedCountIndex << 2;
8889 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8890
8891 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8892 Offset1 |= (CountDw - 1) << 6;
8893
8894 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8895 Offset1 |= ShaderType << 2;
8896
8897 unsigned Offset = Offset0 | (Offset1 << 8);
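// Worked example (illustrative sketch): an amdgcn.ds.ordered.add with
// ordered-count index 1, wave_release = 1 and wave_done = 0 on a pre-GFX10
// target gives Offset0 = 1 << 2 = 4 and
// Offset1 = 1 | (0 << 1) | (0 << 4) | (ShaderType << 2), so the packed
// immediate carries Offset0 in bits [7:0] and Offset1 in bits [15:8].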
8898
8899 SDValue Ops[] = {
8900 Chain,
8901 Value,
8902 DAG.getTargetConstant(Offset, DL, MVT::i16),
8903 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8904 };
8905 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8906 M->getVTList(), Ops, M->getMemoryVT(),
8907 M->getMemOperand());
8908 }
8909 case Intrinsic::amdgcn_raw_buffer_load:
8910 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8911 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8912 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8913 case Intrinsic::amdgcn_raw_buffer_load_format:
8914 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8915 const bool IsFormat =
8916 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8917 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8918
8919 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8920 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8921 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8922 SDValue Ops[] = {
8923 Op.getOperand(0), // Chain
8924 Rsrc, // rsrc
8925 DAG.getConstant(0, DL, MVT::i32), // vindex
8926 Offsets.first, // voffset
8927 SOffset, // soffset
8928 Offsets.second, // offset
8929 Op.getOperand(5), // cachepolicy, swizzled buffer
8930 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8931 };
8932
8933 auto *M = cast<MemSDNode>(Op);
8934 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8935 }
8936 case Intrinsic::amdgcn_struct_buffer_load:
8937 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8938 case Intrinsic::amdgcn_struct_buffer_load_format:
8939 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8940 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8941 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
8942 const bool IsFormat =
8943 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8944 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8945
8946 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8947 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8948 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8949 SDValue Ops[] = {
8950 Op.getOperand(0), // Chain
8951 Rsrc, // rsrc
8952 Op.getOperand(3), // vindex
8953 Offsets.first, // voffset
8954 SOffset, // soffset
8955 Offsets.second, // offset
8956 Op.getOperand(6), // cachepolicy, swizzled buffer
8957 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8958 };
8959
8960 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8961 }
8962 case Intrinsic::amdgcn_raw_tbuffer_load:
8963 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8964 MemSDNode *M = cast<MemSDNode>(Op);
8965 EVT LoadVT = Op.getValueType();
8966 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8967 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8968 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8969
8970 SDValue Ops[] = {
8971 Op.getOperand(0), // Chain
8972 Rsrc, // rsrc
8973 DAG.getConstant(0, DL, MVT::i32), // vindex
8974 Offsets.first, // voffset
8975 SOffset, // soffset
8976 Offsets.second, // offset
8977 Op.getOperand(5), // format
8978 Op.getOperand(6), // cachepolicy, swizzled buffer
8979 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8980 };
8981
8982 if (LoadVT.getScalarType() == MVT::f16)
8983 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8984 M, DAG, Ops);
8985 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8986 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8987 DAG);
8988 }
8989 case Intrinsic::amdgcn_struct_tbuffer_load:
8990 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8991 MemSDNode *M = cast<MemSDNode>(Op);
8992 EVT LoadVT = Op.getValueType();
8993 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8994 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8995 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8996
8997 SDValue Ops[] = {
8998 Op.getOperand(0), // Chain
8999 Rsrc, // rsrc
9000 Op.getOperand(3), // vindex
9001 Offsets.first, // voffset
9002 SOffset, // soffset
9003 Offsets.second, // offset
9004 Op.getOperand(6), // format
9005 Op.getOperand(7), // cachepolicy, swizzled buffer
9006 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9007 };
9008
9009 if (LoadVT.getScalarType() == MVT::f16)
9010 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
9011 M, DAG, Ops);
9012 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9013 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9014 DAG);
9015 }
9016 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9017 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9018 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9019 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9020 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9021 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9022 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9023 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9024 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9025 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9026 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9027 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9028 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9029 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9030 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9031 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9032 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9033 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9034 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9036 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9037 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9038 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9039 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9040 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9041 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9042 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9043 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9044 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9045 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9046 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9047 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9048 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9049 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9050 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9051 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9052 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9053 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9054 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9055 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9056 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9057 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9058 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9059 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9060 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9061 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9062 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9063 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9064 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9065 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9066 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9067 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9068 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9069 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9070 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9071 return lowerRawBufferAtomicIntrin(Op, DAG,
9072 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9073 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9074 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9075 return lowerStructBufferAtomicIntrin(Op, DAG,
9076 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9077 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9079 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9080 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9081 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9082 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9083 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9084 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9085 return lowerStructBufferAtomicIntrin(Op, DAG,
9086 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9087 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9089 return lowerStructBufferAtomicIntrin(Op, DAG,
9090 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9091 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9092 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9093 return lowerStructBufferAtomicIntrin(Op, DAG,
9094 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9095 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9096 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9097 return lowerStructBufferAtomicIntrin(Op, DAG,
9098 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9099 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9100 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9101 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9102 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9104 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9105 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9106 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9107 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9108 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9109 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9110 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9111 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9112 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9113 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9114 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9115 return lowerStructBufferAtomicIntrin(Op, DAG,
9116 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9117
9118 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9119 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9120 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9121 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9122 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9123 SDValue Ops[] = {
9124 Op.getOperand(0), // Chain
9125 Op.getOperand(2), // src
9126 Op.getOperand(3), // cmp
9127 Rsrc, // rsrc
9128 DAG.getConstant(0, DL, MVT::i32), // vindex
9129 Offsets.first, // voffset
9130 SOffset, // soffset
9131 Offsets.second, // offset
9132 Op.getOperand(7), // cachepolicy
9133 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9134 };
9135 EVT VT = Op.getValueType();
9136 auto *M = cast<MemSDNode>(Op);
9137
9138 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9139 Op->getVTList(), Ops, VT, M->getMemOperand());
9140 }
9141 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9142 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9143 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9144 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9145 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9146 SDValue Ops[] = {
9147 Op.getOperand(0), // Chain
9148 Op.getOperand(2), // src
9149 Op.getOperand(3), // cmp
9150 Rsrc, // rsrc
9151 Op.getOperand(5), // vindex
9152 Offsets.first, // voffset
9153 SOffset, // soffset
9154 Offsets.second, // offset
9155 Op.getOperand(8), // cachepolicy
9156 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9157 };
9158 EVT VT = Op.getValueType();
9159 auto *M = cast<MemSDNode>(Op);
9160
9161 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9162 Op->getVTList(), Ops, VT, M->getMemOperand());
9163 }
9164 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9165 MemSDNode *M = cast<MemSDNode>(Op);
9166 SDValue NodePtr = M->getOperand(2);
9167 SDValue RayExtent = M->getOperand(3);
9168 SDValue RayOrigin = M->getOperand(4);
9169 SDValue RayDir = M->getOperand(5);
9170 SDValue RayInvDir = M->getOperand(6);
9171 SDValue TDescr = M->getOperand(7);
9172
9173 assert(NodePtr.getValueType() == MVT::i32 ||
9174 NodePtr.getValueType() == MVT::i64);
9175 assert(RayDir.getValueType() == MVT::v3f16 ||
9176 RayDir.getValueType() == MVT::v3f32);
9177
9178 if (!Subtarget->hasGFX10_AEncoding()) {
9179 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9180 return SDValue();
9181 }
9182
9183 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9184 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9185 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9186 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9187 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9188 const unsigned NumVDataDwords = 4;
9189 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9190 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9191 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9192 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9193 IsGFX12Plus;
9194 const unsigned BaseOpcodes[2][2] = {
9195 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9196 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9197 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9198 int Opcode;
9199 if (UseNSA) {
9200 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9201 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9202 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9203 : AMDGPU::MIMGEncGfx10NSA,
9204 NumVDataDwords, NumVAddrDwords);
9205 } else {
9206 assert(!IsGFX12Plus);
9207 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9208 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9209 : AMDGPU::MIMGEncGfx10Default,
9210 NumVDataDwords, NumVAddrDwords);
9211 }
9212 assert(Opcode != -1);
9213
9214 SmallVector<SDValue, 16> Ops;
9215
9216 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9217 SmallVector<SDValue, 3> Lanes;
9218 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9219 if (Lanes[0].getValueSizeInBits() == 32) {
9220 for (unsigned I = 0; I < 3; ++I)
9221 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9222 } else {
9223 if (IsAligned) {
9224 Ops.push_back(
9225 DAG.getBitcast(MVT::i32,
9226 DAG.getBuildVector(MVT::v2f16, DL,
9227 { Lanes[0], Lanes[1] })));
9228 Ops.push_back(Lanes[2]);
9229 } else {
9230 SDValue Elt0 = Ops.pop_back_val();
9231 Ops.push_back(
9232 DAG.getBitcast(MVT::i32,
9233 DAG.getBuildVector(MVT::v2f16, DL,
9234 { Elt0, Lanes[0] })));
9235 Ops.push_back(
9236 DAG.getBitcast(MVT::i32,
9237 DAG.getBuildVector(MVT::v2f16, DL,
9238 { Lanes[1], Lanes[2] })));
9239 }
9240 }
9241 };
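// Illustrative sketch: for a v3f16 vector <x, y, z>, the aligned path above
// emits bitcast(v2f16 <x, y>) followed by z on its own, while the unaligned
// path re-pairs the previously pushed element E as bitcast(v2f16 <E, x>) and
// bitcast(v2f16 <y, z>), keeping the VADDR dwords densely packed.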
9242
9243 if (UseNSA && IsGFX11Plus) {
9244 Ops.push_back(NodePtr);
9245 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9246 Ops.push_back(RayOrigin);
9247 if (IsA16) {
9248 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9249 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9250 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9251 for (unsigned I = 0; I < 3; ++I) {
9252 MergedLanes.push_back(DAG.getBitcast(
9253 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9254 {DirLanes[I], InvDirLanes[I]})));
9255 }
9256 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9257 } else {
9258 Ops.push_back(RayDir);
9259 Ops.push_back(RayInvDir);
9260 }
9261 } else {
9262 if (Is64)
9263 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9264 2);
9265 else
9266 Ops.push_back(NodePtr);
9267
9268 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9269 packLanes(RayOrigin, true);
9270 packLanes(RayDir, true);
9271 packLanes(RayInvDir, false);
9272 }
9273
9274 if (!UseNSA) {
9275 // Build a single vector containing all the operands so far prepared.
9276 if (NumVAddrDwords > 12) {
9277 SDValue Undef = DAG.getUNDEF(MVT::i32);
9278 Ops.append(16 - Ops.size(), Undef);
9279 }
9280 assert(Ops.size() >= 8 && Ops.size() <= 12);
9281 SDValue MergedOps = DAG.getBuildVector(
9282 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9283 Ops.clear();
9284 Ops.push_back(MergedOps);
9285 }
9286
9287 Ops.push_back(TDescr);
9288 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9289 Ops.push_back(M->getChain());
9290
9291 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9292 MachineMemOperand *MemRef = M->getMemOperand();
9293 DAG.setNodeMemRefs(NewNode, {MemRef});
9294 return SDValue(NewNode, 0);
9295 }
9296 case Intrinsic::amdgcn_global_atomic_fmin:
9297 case Intrinsic::amdgcn_global_atomic_fmax:
9298 case Intrinsic::amdgcn_global_atomic_fmin_num:
9299 case Intrinsic::amdgcn_global_atomic_fmax_num:
9300 case Intrinsic::amdgcn_flat_atomic_fmin:
9301 case Intrinsic::amdgcn_flat_atomic_fmax:
9302 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9303 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9304 MemSDNode *M = cast<MemSDNode>(Op);
9305 SDValue Ops[] = {
9306 M->getOperand(0), // Chain
9307 M->getOperand(2), // Ptr
9308 M->getOperand(3) // Value
9309 };
9310 unsigned Opcode = 0;
9311 switch (IntrID) {
9312 case Intrinsic::amdgcn_global_atomic_fmin:
9313 case Intrinsic::amdgcn_global_atomic_fmin_num:
9314 case Intrinsic::amdgcn_flat_atomic_fmin:
9315 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9316 Opcode = ISD::ATOMIC_LOAD_FMIN;
9317 break;
9318 }
9319 case Intrinsic::amdgcn_global_atomic_fmax:
9320 case Intrinsic::amdgcn_global_atomic_fmax_num:
9321 case Intrinsic::amdgcn_flat_atomic_fmax:
9322 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9323 Opcode = ISD::ATOMIC_LOAD_FMAX;
9324 break;
9325 }
9326 default:
9327 llvm_unreachable("unhandled atomic opcode");
9328 }
9329 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9330 Ops, M->getMemOperand());
9331 }
9332 case Intrinsic::amdgcn_s_get_barrier_state: {
9333 SDValue Chain = Op->getOperand(0);
9334 SmallVector<SDValue, 2> Ops;
9335 unsigned Opc;
9336 bool IsInlinableBarID = false;
9337 int64_t BarID;
9338
9339 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9340 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9341 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9342 }
9343
9344 if (IsInlinableBarID) {
9345 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9346 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9347 Ops.push_back(K);
9348 } else {
9349 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9350 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9351 Ops.push_back(M0Val.getValue(0));
9352 }
9353
9354 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9355 return SDValue(NewMI, 0);
9356 }
9357 default:
9358
9359 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9360 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9361 return lowerImage(Op, ImageDimIntr, DAG, true);
9362
9363 return SDValue();
9364 }
9365}
9366
9367// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9368// dwordx4 if on SI and handle TFE loads.
9369SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9370 SDVTList VTList,
9371 ArrayRef<SDValue> Ops, EVT MemVT,
9372 MachineMemOperand *MMO,
9373 SelectionDAG &DAG) const {
9374 LLVMContext &C = *DAG.getContext();
9375 MachineFunction &MF = DAG.getMachineFunction();
9376 EVT VT = VTList.VTs[0];
9377
9378 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9379 bool IsTFE = VTList.NumVTs == 3;
9380 if (IsTFE) {
9381 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9382 unsigned NumOpDWords = NumValueDWords + 1;
9383 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9384 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9385 MachineMemOperand *OpDWordsMMO =
9386 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9387 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9388 OpDWordsVT, OpDWordsMMO, DAG);
9389 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9390 DAG.getVectorIdxConstant(NumValueDWords, DL));
9391 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9392 SDValue ValueDWords =
9393 NumValueDWords == 1
9394 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9395 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9396 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9397 ZeroIdx);
9398 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9399 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9400 }
9401
9402 if (!Subtarget->hasDwordx3LoadStores() &&
9403 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9404 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9405 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9406 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9407 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9408 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9409 WidenedMemVT, WidenedMMO);
9410 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9411 DAG.getVectorIdxConstant(0, DL));
9412 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9413 }
9414
9415 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9416}
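// Usage note (sketch): on a subtarget without dwordx3 load/stores, a
// v3i32/v3f32 request is widened here to four dwords and trimmed back with
// EXTRACT_SUBVECTOR; for a TFE load the extra status dword is peeled off with
// EXTRACT_VECTOR_ELT before the data dwords are bitcast to the requested type.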
9417
9418SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9419 bool ImageStore) const {
9420 EVT StoreVT = VData.getValueType();
9421
9422 // No change for f16 and legal vector D16 types.
9423 if (!StoreVT.isVector())
9424 return VData;
9425
9426 SDLoc DL(VData);
9427 unsigned NumElements = StoreVT.getVectorNumElements();
9428
9429 if (Subtarget->hasUnpackedD16VMem()) {
9430 // We need to unpack the packed data to store.
9431 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9432 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9433
9434 EVT EquivStoreVT =
9435 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9436 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9437 return DAG.UnrollVectorOp(ZExt.getNode());
9438 }
9439
9440 // The sq block of gfx8.1 does not estimate register use correctly for d16
9441 // image store instructions. The data operand is computed as if it were not a
9442 // d16 image instruction.
9443 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9444 // Bitcast to i16
9445 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9446 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9447
9448 // Decompose into scalars
9449 SmallVector<SDValue, 4> Elts;
9450 DAG.ExtractVectorElements(IntVData, Elts);
9451
9452 // Group pairs of i16 into v2i16 and bitcast to i32
9453 SmallVector<SDValue, 4> PackedElts;
9454 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9455 SDValue Pair =
9456 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9457 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9458 PackedElts.push_back(IntPair);
9459 }
9460 if ((NumElements % 2) == 1) {
9461 // Handle v3i16
9462 unsigned I = Elts.size() / 2;
9463 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9464 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9465 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9466 PackedElts.push_back(IntPair);
9467 }
9468
9469 // Pad using UNDEF
9470 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9471
9472 // Build final vector
9473 EVT VecVT =
9474 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9475 return DAG.getBuildVector(VecVT, DL, PackedElts);
9476 }
9477
9478 if (NumElements == 3) {
9479 EVT IntStoreVT =
9480 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9481 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9482
9483 EVT WidenedStoreVT = EVT::getVectorVT(
9484 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9485 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9486 WidenedStoreVT.getStoreSizeInBits());
9487 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9488 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9489 }
9490
9491 assert(isTypeLegal(StoreVT));
9492 return VData;
9493}
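// Illustrative example: with the gfx8.1 image-store bug, a v4f16 value
// <a, b, c, d> is repacked above as bitcast(v2i16 <a, b>) and
// bitcast(v2i16 <c, d>) and then padded with undef up to four i32 lanes,
// because the hardware counts data registers as if the store were not D16.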
9494
9495SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9496 SelectionDAG &DAG) const {
9497 SDLoc DL(Op);
9498 SDValue Chain = Op.getOperand(0);
9499 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9500 MachineFunction &MF = DAG.getMachineFunction();
9501
9502 switch (IntrinsicID) {
9503 case Intrinsic::amdgcn_exp_compr: {
9504 if (!Subtarget->hasCompressedExport()) {
9505 DiagnosticInfoUnsupported BadIntrin(
9506 DAG.getMachineFunction().getFunction(),
9507 "intrinsic not supported on subtarget", DL.getDebugLoc());
9508 DAG.getContext()->diagnose(BadIntrin);
9509 }
9510 SDValue Src0 = Op.getOperand(4);
9511 SDValue Src1 = Op.getOperand(5);
9512 // Hack around illegal type on SI by directly selecting it.
9513 if (isTypeLegal(Src0.getValueType()))
9514 return SDValue();
9515
9516 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9517 SDValue Undef = DAG.getUNDEF(MVT::f32);
9518 const SDValue Ops[] = {
9519 Op.getOperand(2), // tgt
9520 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9521 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9522 Undef, // src2
9523 Undef, // src3
9524 Op.getOperand(7), // vm
9525 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9526 Op.getOperand(3), // en
9527 Op.getOperand(0) // Chain
9528 };
9529
9530 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9531 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9532 }
9533 case Intrinsic::amdgcn_s_barrier: {
9534 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9535 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9536 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9537 if (WGSize <= ST.getWavefrontSize())
9538 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9539 Op.getOperand(0)), 0);
9540 }
9541
9542 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9543 if (ST.hasSplitBarriers()) {
9544 SDValue K =
9545 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9546 SDValue BarSignal =
9547 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9548 MVT::Other, K, Op.getOperand(0)),
9549 0);
9550 SDValue BarWait =
9551 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9552 BarSignal.getValue(0)),
9553 0);
9554 return BarWait;
9555 }
9556
9557 return SDValue();
9558 };
9559
9560 case Intrinsic::amdgcn_struct_tbuffer_store:
9561 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9562 SDValue VData = Op.getOperand(2);
9563 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9564 if (IsD16)
9565 VData = handleD16VData(VData, DAG);
9566 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9567 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9568 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9569 SDValue Ops[] = {
9570 Chain,
9571 VData, // vdata
9572 Rsrc, // rsrc
9573 Op.getOperand(4), // vindex
9574 Offsets.first, // voffset
9575 SOffset, // soffset
9576 Offsets.second, // offset
9577 Op.getOperand(7), // format
9578 Op.getOperand(8), // cachepolicy, swizzled buffer
9579 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9580 };
9581 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9582 AMDGPUISD::TBUFFER_STORE_FORMAT;
9583 MemSDNode *M = cast<MemSDNode>(Op);
9584 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9585 M->getMemoryVT(), M->getMemOperand());
9586 }
9587
9588 case Intrinsic::amdgcn_raw_tbuffer_store:
9589 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9590 SDValue VData = Op.getOperand(2);
9591 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9592 if (IsD16)
9593 VData = handleD16VData(VData, DAG);
9594 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9595 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9596 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9597 SDValue Ops[] = {
9598 Chain,
9599 VData, // vdata
9600 Rsrc, // rsrc
9601 DAG.getConstant(0, DL, MVT::i32), // vindex
9602 Offsets.first, // voffset
9603 SOffset, // soffset
9604 Offsets.second, // offset
9605 Op.getOperand(6), // format
9606 Op.getOperand(7), // cachepolicy, swizzled buffer
9607 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9608 };
9609 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9610 AMDGPUISD::TBUFFER_STORE_FORMAT;
9611 MemSDNode *M = cast<MemSDNode>(Op);
9612 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9613 M->getMemoryVT(), M->getMemOperand());
9614 }
9615
9616 case Intrinsic::amdgcn_raw_buffer_store:
9617 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9618 case Intrinsic::amdgcn_raw_buffer_store_format:
9619 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9620 const bool IsFormat =
9621 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9622 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9623
9624 SDValue VData = Op.getOperand(2);
9625 EVT VDataVT = VData.getValueType();
9626 EVT EltType = VDataVT.getScalarType();
9627 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9628 if (IsD16) {
9629 VData = handleD16VData(VData, DAG);
9630 VDataVT = VData.getValueType();
9631 }
9632
9633 if (!isTypeLegal(VDataVT)) {
9634 VData =
9635 DAG.getNode(ISD::BITCAST, DL,
9636 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9637 }
9638
9639 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9640 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9641 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9642 SDValue Ops[] = {
9643 Chain,
9644 VData,
9645 Rsrc,
9646 DAG.getConstant(0, DL, MVT::i32), // vindex
9647 Offsets.first, // voffset
9648 SOffset, // soffset
9649 Offsets.second, // offset
9650 Op.getOperand(6), // cachepolicy, swizzled buffer
9651 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9652 };
9653 unsigned Opc =
9654 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9655 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9656 MemSDNode *M = cast<MemSDNode>(Op);
9657
9658 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9659 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9660 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9661
9662 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9663 M->getMemoryVT(), M->getMemOperand());
9664 }
9665
9666 case Intrinsic::amdgcn_struct_buffer_store:
9667 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9668 case Intrinsic::amdgcn_struct_buffer_store_format:
9669 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9670 const bool IsFormat =
9671 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9672 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9673
9674 SDValue VData = Op.getOperand(2);
9675 EVT VDataVT = VData.getValueType();
9676 EVT EltType = VDataVT.getScalarType();
9677 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9678
9679 if (IsD16) {
9680 VData = handleD16VData(VData, DAG);
9681 VDataVT = VData.getValueType();
9682 }
9683
9684 if (!isTypeLegal(VDataVT)) {
9685 VData =
9686 DAG.getNode(ISD::BITCAST, DL,
9687 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9688 }
9689
9690 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9691 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9692 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9693 SDValue Ops[] = {
9694 Chain,
9695 VData,
9696 Rsrc,
9697 Op.getOperand(4), // vindex
9698 Offsets.first, // voffset
9699 SOffset, // soffset
9700 Offsets.second, // offset
9701 Op.getOperand(7), // cachepolicy, swizzled buffer
9702 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9703 };
9704 unsigned Opc =
9705 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9706 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9707 MemSDNode *M = cast<MemSDNode>(Op);
9708
9709 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9710 EVT VDataType = VData.getValueType().getScalarType();
9711 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9712 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9713
9714 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9715 M->getMemoryVT(), M->getMemOperand());
9716 }
9717 case Intrinsic::amdgcn_raw_buffer_load_lds:
9718 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9719 case Intrinsic::amdgcn_struct_buffer_load_lds:
9720 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9721 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9722 unsigned Opc;
9723 bool HasVIndex =
9724 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9725 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9726 unsigned OpOffset = HasVIndex ? 1 : 0;
9727 SDValue VOffset = Op.getOperand(5 + OpOffset);
9728 bool HasVOffset = !isNullConstant(VOffset);
9729 unsigned Size = Op->getConstantOperandVal(4);
9730
9731 switch (Size) {
9732 default:
9733 return SDValue();
9734 case 1:
9735 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9736 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9737 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9738 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9739 break;
9740 case 2:
9741 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9742 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9743 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9744 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9745 break;
9746 case 4:
9747 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9748 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9749 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9750 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9751 break;
9752 }
9753
9754 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9755
9756 SmallVector<SDValue, 8> Ops;
9757
9758 if (HasVIndex && HasVOffset)
9759 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9760 { Op.getOperand(5), // VIndex
9761 VOffset }));
9762 else if (HasVIndex)
9763 Ops.push_back(Op.getOperand(5));
9764 else if (HasVOffset)
9765 Ops.push_back(VOffset);
9766
9767 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9768 Ops.push_back(Rsrc);
9769 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9770 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9771 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9772 Ops.push_back(
9773 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9774 Ops.push_back(DAG.getTargetConstant(
9775 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9776 Ops.push_back(M0Val.getValue(0)); // Chain
9777 Ops.push_back(M0Val.getValue(1)); // Glue
9778
9779 auto *M = cast<MemSDNode>(Op);
9780 MachineMemOperand *LoadMMO = M->getMemOperand();
9781 // Don't set the offset value here because the pointer points to the base of
9782 // the buffer.
9783 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9784
9785 MachinePointerInfo StorePtrI = LoadPtrI;
9786 LoadPtrI.V = PoisonValue::get(
9787 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9788 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9789 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9790
9791 auto F = LoadMMO->getFlags() &
9792 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9793 LoadMMO =
9794 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9795 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9796
9797 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9798 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9799 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9800
9801 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9802 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9803
9804 return SDValue(Load, 0);
9805 }
9806 case Intrinsic::amdgcn_global_load_lds: {
9807 unsigned Opc;
9808 unsigned Size = Op->getConstantOperandVal(4);
9809 switch (Size) {
9810 default:
9811 return SDValue();
9812 case 1:
9813 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9814 break;
9815 case 2:
9816 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9817 break;
9818 case 4:
9819 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9820 break;
9821 }
9822
9823 auto *M = cast<MemSDNode>(Op);
9824 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9825
9826 SmallVector<SDValue, 6> Ops;
9827
9828 SDValue Addr = Op.getOperand(2); // Global ptr
9829 SDValue VOffset;
9830 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9831 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9832 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9833 SDValue LHS = Addr.getOperand(0);
9834 SDValue RHS = Addr.getOperand(1);
9835
9836 if (LHS->isDivergent())
9837 std::swap(LHS, RHS);
9838
9839 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9840 RHS.getOperand(0).getValueType() == MVT::i32) {
9841 // add (i64 sgpr), (zero_extend (i32 vgpr))
9842 Addr = LHS;
9843 VOffset = RHS.getOperand(0);
9844 }
9845 }
9846
9847 Ops.push_back(Addr);
9848 if (!Addr->isDivergent()) {
9849 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9850 if (!VOffset)
9851 VOffset = SDValue(
9852 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9853 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9854 Ops.push_back(VOffset);
9855 }
9856
9857 Ops.push_back(Op.getOperand(5)); // Offset
9858 Ops.push_back(Op.getOperand(6)); // CPol
9859 Ops.push_back(M0Val.getValue(0)); // Chain
9860 Ops.push_back(M0Val.getValue(1)); // Glue
9861
9862 MachineMemOperand *LoadMMO = M->getMemOperand();
9863 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9864 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9865 MachinePointerInfo StorePtrI = LoadPtrI;
9866 LoadPtrI.V = PoisonValue::get(
9867 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9868 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9869 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9870 auto F = LoadMMO->getFlags() &
9871 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9872 LoadMMO =
9873 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9874 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9875 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9876 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9877 LoadMMO->getAAInfo());
9878
9879 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9880 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9881
9882 return SDValue(Load, 0);
9883 }
9884 case Intrinsic::amdgcn_end_cf:
9885 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9886 Op->getOperand(2), Chain), 0);
9887 case Intrinsic::amdgcn_s_barrier_init:
9888 case Intrinsic::amdgcn_s_barrier_join:
9889 case Intrinsic::amdgcn_s_wakeup_barrier: {
9890 SDValue Chain = Op->getOperand(0);
9891 SmallVector<SDValue, 2> Ops;
9892 SDValue BarOp = Op->getOperand(2);
9893 unsigned Opc;
9894 bool IsInlinableBarID = false;
9895 int64_t BarVal;
9896
9897 if (isa<ConstantSDNode>(BarOp)) {
9898 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9899 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9900 }
9901
9902 if (IsInlinableBarID) {
9903 switch (IntrinsicID) {
9904 default:
9905 return SDValue();
9906 case Intrinsic::amdgcn_s_barrier_init:
9907 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9908 break;
9909 case Intrinsic::amdgcn_s_barrier_join:
9910 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9911 break;
9912 case Intrinsic::amdgcn_s_wakeup_barrier:
9913 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9914 break;
9915 }
9916
9917 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9918 Ops.push_back(K);
9919 } else {
9920 switch (IntrinsicID) {
9921 default:
9922 return SDValue();
9923 case Intrinsic::amdgcn_s_barrier_init:
9924 Opc = AMDGPU::S_BARRIER_INIT_M0;
9925 break;
9926 case Intrinsic::amdgcn_s_barrier_join:
9927 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9928 break;
9929 case Intrinsic::amdgcn_s_wakeup_barrier:
9930 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9931 break;
9932 }
9933 }
9934
9935 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9936 SDValue M0Val;
9937 // Member count will be read from M0[16:22]
9938 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9939 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9940
9941 if (!IsInlinableBarID) {
9942 // If reference to barrier id is not an inline constant then it must be
9943 // referenced with M0[4:0]. Perform an OR with the member count to
9944 // include it in M0.
9945 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9946 Op.getOperand(2), M0Val),
9947 0);
9948 }
9949 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9950 } else if (!IsInlinableBarID) {
9951 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9952 }
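// Sketch of the resulting M0 layout for s_barrier_init (illustrative): the
// member count sits in bits [22:16] and, when the barrier id is not an inline
// constant, the id sits in bits [4:0]; e.g. a member count of 32 with barrier
// id 70 yields M0 = (32 << 16) | 70.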
9953
9954 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9955 return SDValue(NewMI, 0);
9956 }
9957 default: {
9958 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9960 return lowerImage(Op, ImageDimIntr, DAG, true);
9961
9962 return Op;
9963 }
9964 }
9965}
9966
9967// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9968// offset (the offset that is included in bounds checking and swizzling, to be
9969// split between the instruction's voffset and immoffset fields) and soffset
9970// (the offset that is excluded from bounds checking and swizzling, to go in
9971// the instruction's soffset field). This function takes the first kind of
9972// offset and figures out how to split it between voffset and immoffset.
9973std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9974 SDValue Offset, SelectionDAG &DAG) const {
9975 SDLoc DL(Offset);
9976 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9977 SDValue N0 = Offset;
9978 ConstantSDNode *C1 = nullptr;
9979
9980 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9981 N0 = SDValue();
9982 else if (DAG.isBaseWithConstantOffset(N0)) {
9983 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9984 N0 = N0.getOperand(0);
9985 }
9986
9987 if (C1) {
9988 unsigned ImmOffset = C1->getZExtValue();
9989 // If the immediate value is too big for the immoffset field, put only bits
9990 // that would normally fit in the immoffset field. The remaining value that
9991 // is copied/added for the voffset field is a large power of 2, and it
9992 // stands more chance of being CSEd with the copy/add for another similar
9993 // load/store.
9994 // However, do not do that rounding down if that is a negative
9995 // number, as it appears to be illegal to have a negative offset in the
9996 // vgpr, even if adding the immediate offset makes it positive.
9997 unsigned Overflow = ImmOffset & ~MaxImm;
9998 ImmOffset -= Overflow;
9999 if ((int32_t)Overflow < 0) {
10000 Overflow += ImmOffset;
10001 ImmOffset = 0;
10002 }
10003 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10004 if (Overflow) {
10005 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10006 if (!N0)
10007 N0 = OverflowVal;
10008 else {
10009 SDValue Ops[] = { N0, OverflowVal };
10010 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10011 }
10012 }
10013 }
10014 if (!N0)
10015 N0 = DAG.getConstant(0, DL, MVT::i32);
10016 if (!C1)
10017 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10018 return {N0, SDValue(C1, 0)};
10019}
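// Worked example (assuming a 4095-byte maximum immediate, as on pre-GFX12
// MUBUF; illustrative only): a combined offset of 5000 is split into
// Overflow = 5000 & ~4095 = 4096, which goes to the voffset, and
// ImmOffset = 5000 - 4096 = 904, which goes to the immediate field, so the
// voffset add is a round power of two that CSEs well across neighbouring
// accesses.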
10020
10021// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10022// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10023// pointed to by Offsets.
10024void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10025 SelectionDAG &DAG, SDValue *Offsets,
10026 Align Alignment) const {
10027 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10028 SDLoc DL(CombinedOffset);
10029 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10030 uint32_t Imm = C->getZExtValue();
10031 uint32_t SOffset, ImmOffset;
10032 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10033 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10034 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10035 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10036 return;
10037 }
10038 }
10039 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10040 SDValue N0 = CombinedOffset.getOperand(0);
10041 SDValue N1 = CombinedOffset.getOperand(1);
10042 uint32_t SOffset, ImmOffset;
10043 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10044 if (Offset >= 0 &&
10045 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10046 Offsets[0] = N0;
10047 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10048 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10049 return;
10050 }
10051 }
10052
10053 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10054 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10055 : DAG.getConstant(0, DL, MVT::i32);
10056
10057 Offsets[0] = CombinedOffset;
10058 Offsets[1] = SOffsetZero;
10059 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10060}
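// Usage note (sketch): a purely constant combined offset that
// splitMUBUFOffset can encode yields {voffset = 0, soffset, instoffset}; a
// base-plus-constant offset keeps the base in voffset; anything else falls
// through to {CombinedOffset, 0 (or SGPR_NULL on restricted-soffset
// targets), 0}.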
10061
10062SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10063 SelectionDAG &DAG) const {
10064 if (!MaybePointer.getValueType().isScalarInteger())
10065 return MaybePointer;
10066
10067 SDLoc DL(MaybePointer);
10068
10069 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10070 return Rsrc;
10071}
10072
10073// Wrap a global or flat pointer into a buffer intrinsic using the flags
10074// specified in the intrinsic.
10075SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10076 SelectionDAG &DAG) const {
10077 SDLoc Loc(Op);
10078
10079 SDValue Pointer = Op->getOperand(1);
10080 SDValue Stride = Op->getOperand(2);
10081 SDValue NumRecords = Op->getOperand(3);
10082 SDValue Flags = Op->getOperand(4);
10083
10084 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10085 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10086 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10087 std::optional<uint32_t> ConstStride = std::nullopt;
10088 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10089 ConstStride = ConstNode->getZExtValue();
10090
10091 SDValue NewHighHalf = Masked;
10092 if (!ConstStride || *ConstStride != 0) {
10093 SDValue ShiftedStride;
10094 if (ConstStride) {
10095 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10096 } else {
10097 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10098 ShiftedStride =
10099 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10100 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10101 }
10102 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10103 }
10104
10105 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10106 NewHighHalf, NumRecords, Flags);
10107 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10108 return RsrcPtr;
10109}
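// Resulting descriptor layout (sketch): word0 = pointer[31:0],
// word1 = pointer[47:32] | (stride << 16), word2 = NumRecords,
// word3 = Flags; e.g. a constant stride of 4 ORs 0x40000 into the high half
// of the base address.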
10110
10111// Handle 8 bit and 16 bit buffer loads
10112SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10113 EVT LoadVT, SDLoc DL,
10114 ArrayRef<SDValue> Ops,
10115 MachineMemOperand *MMO,
10116 bool IsTFE) const {
10117 EVT IntVT = LoadVT.changeTypeToInteger();
10118
10119 if (IsTFE) {
10120 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10121 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10122 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10123 MachineFunction &MF = DAG.getMachineFunction();
10124 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
10125 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
10126 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
10127 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10128 DAG.getConstant(1, DL, MVT::i32));
10129 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10130 DAG.getConstant(0, DL, MVT::i32));
10131 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
10132 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
10133 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10134 }
10135
10136 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10137 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10138
10139 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10140 SDValue BufferLoad =
10141 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10142 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10143 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10144
10145 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10146}
10147
10148// Handle 8 bit and 16 bit buffer stores
10149SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10150 EVT VDataType, SDLoc DL,
10151 SDValue Ops[],
10152 MemSDNode *M) const {
10153 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10154 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10155
10156 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10157 Ops[1] = BufferStoreExt;
10158 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10159 AMDGPUISD::BUFFER_STORE_SHORT;
10160 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10161 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10162 M->getMemOperand());
10163}
10164
10165 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10166 ISD::LoadExtType ExtType, SDValue Op,
10167 const SDLoc &SL, EVT VT) {
10168 if (VT.bitsLT(Op.getValueType()))
10169 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10170
10171 switch (ExtType) {
10172 case ISD::SEXTLOAD:
10173 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10174 case ISD::ZEXTLOAD:
10175 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10176 case ISD::EXTLOAD:
10177 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10178 case ISD::NON_EXTLOAD:
10179 return Op;
10180 }
10181
10182 llvm_unreachable("invalid ext type");
10183}
10184
10185// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10186// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10187SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10188 SelectionDAG &DAG = DCI.DAG;
10189 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10190 return SDValue();
10191
10192 // FIXME: Constant loads should all be marked invariant.
10193 unsigned AS = Ld->getAddressSpace();
10194 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10195 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10196 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10197 return SDValue();
10198
10199 // Don't do this early, since it may interfere with adjacent load merging for
10200 // illegal types. We can avoid losing alignment information for exotic types
10201 // pre-legalize.
10202 EVT MemVT = Ld->getMemoryVT();
10203 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10204 MemVT.getSizeInBits() >= 32)
10205 return SDValue();
10206
10207 SDLoc SL(Ld);
10208
10209 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10210 "unexpected vector extload");
10211
10212 // TODO: Drop only high part of range.
10213 SDValue Ptr = Ld->getBasePtr();
10214 SDValue NewLoad = DAG.getLoad(
10215 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10216 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10217 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10218 nullptr); // Drop ranges
10219
10220 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10221 if (MemVT.isFloatingPoint()) {
10222 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10223 "unexpected fp extload");
10224 TruncVT = MemVT.changeTypeToInteger();
10225 }
10226
10227 SDValue Cvt = NewLoad;
10228 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10229 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10230 DAG.getValueType(TruncVT));
10231 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10232 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10233 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10234 } else {
10235 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10236 }
10237
10238 EVT VT = Ld->getValueType(0);
10239 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10240
10241 DCI.AddToWorklist(Cvt.getNode());
10242
10243 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10244 // the appropriate extension from the 32-bit load.
10245 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10246 DCI.AddToWorklist(Cvt.getNode());
10247
10248 // Handle conversion back to floating point if necessary.
10249 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10250
10251 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10252}
10253
10254 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10255 const SIMachineFunctionInfo &Info) {
10256 // TODO: Should check if the address can definitely not access stack.
10257 if (Info.isEntryFunction())
10258 return Info.getUserSGPRInfo().hasFlatScratchInit();
10259 return true;
10260}
10261
10262SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10263 SDLoc DL(Op);
10264 LoadSDNode *Load = cast<LoadSDNode>(Op);
10265 ISD::LoadExtType ExtType = Load->getExtensionType();
10266 EVT MemVT = Load->getMemoryVT();
10267
10268 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10269 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10270 return SDValue();
10271
10272 // FIXME: Copied from PPC
10273 // First, load into 32 bits, then truncate to 1 bit.
10274
10275 SDValue Chain = Load->getChain();
10276 SDValue BasePtr = Load->getBasePtr();
10277 MachineMemOperand *MMO = Load->getMemOperand();
10278
10279 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10280
10281 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10282 BasePtr, RealMemVT, MMO);
10283
10284 if (!MemVT.isVector()) {
10285 SDValue Ops[] = {
10286 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10287 NewLD.getValue(1)
10288 };
10289
10290 return DAG.getMergeValues(Ops, DL);
10291 }
10292
10293 SmallVector<SDValue, 3> Elts;
10294 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10295 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10296 DAG.getConstant(I, DL, MVT::i32));
10297
10298 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10299 }
10300
10301 SDValue Ops[] = {
10302 DAG.getBuildVector(MemVT, DL, Elts),
10303 NewLD.getValue(1)
10304 };
10305
10306 return DAG.getMergeValues(Ops, DL);
10307 }
10308
10309 if (!MemVT.isVector())
10310 return SDValue();
10311
10312 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10313 "Custom lowering for non-i32 vectors hasn't been implemented.");
10314
10315 Align Alignment = Load->getAlign();
10316 unsigned AS = Load->getAddressSpace();
10317 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10318 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10319 return SplitVectorLoad(Op, DAG);
10320 }
10321
10322 MachineFunction &MF = DAG.getMachineFunction();
10323 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10324 // If there is a possibility that a flat instruction accesses scratch memory,
10325 // then we need to use the same legalization rules we use for private.
10326 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10327 !Subtarget->hasMultiDwordFlatScratchAddressing())
10328 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10329 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10330
10331 unsigned NumElements = MemVT.getVectorNumElements();
10332
10333 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10334 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10335 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10336 if (MemVT.isPow2VectorType() ||
10337 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10338 return SDValue();
10339 return WidenOrSplitVectorLoad(Op, DAG);
10340 }
10341 // Non-uniform loads will be selected to MUBUF instructions, so they
10342 // have the same legalization requirements as global and private
10343 // loads.
10344 //
10345 }
10346
10347 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10348 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10349 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10350 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10351 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10352 Alignment >= Align(4) && NumElements < 32) {
10353 if (MemVT.isPow2VectorType() ||
10354 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10355 return SDValue();
10356 return WidenOrSplitVectorLoad(Op, DAG);
10357 }
10358 // Non-uniform loads will be selected to MUBUF instructions, so they
10359 // have the same legalization requirements as global and private
10360 // loads.
10361 //
10362 }
10363 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10364 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10365 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10366 AS == AMDGPUAS::FLAT_ADDRESS) {
10367 if (NumElements > 4)
10368 return SplitVectorLoad(Op, DAG);
10369 // v3 loads not supported on SI.
10370 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10371 return WidenOrSplitVectorLoad(Op, DAG);
10372
10373 // v3 and v4 loads are supported for private and global memory.
10374 return SDValue();
10375 }
10376 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10377 // Depending on the setting of the private_element_size field in the
10378 // resource descriptor, we can only make private accesses up to a certain
10379 // size.
10380 switch (Subtarget->getMaxPrivateElementSize()) {
10381 case 4: {
10382 SDValue Ops[2];
10383 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10384 return DAG.getMergeValues(Ops, DL);
10385 }
10386 case 8:
10387 if (NumElements > 2)
10388 return SplitVectorLoad(Op, DAG);
10389 return SDValue();
10390 case 16:
10391 // Same as global/flat
10392 if (NumElements > 4)
10393 return SplitVectorLoad(Op, DAG);
10394 // v3 loads not supported on SI.
10395 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10396 return WidenOrSplitVectorLoad(Op, DAG);
10397
10398 return SDValue();
10399 default:
10400 llvm_unreachable("unsupported private_element_size");
10401 }
10402 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10403 unsigned Fast = 0;
10404 auto Flags = Load->getMemOperand()->getFlags();
10405 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10406 Load->getAlign(), Flags, &Fast) &&
10407 Fast > 1)
10408 return SDValue();
10409
10410 if (MemVT.isVector())
10411 return SplitVectorLoad(Op, DAG);
10412 }
10413
10414 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10415 MemVT, *Load->getMemOperand())) {
10416 SDValue Ops[2];
10417 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10418 return DAG.getMergeValues(Ops, DL);
10419 }
10420
10421 return SDValue();
10422}
10423
10424SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10425 EVT VT = Op.getValueType();
10426 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10427 VT.getSizeInBits() == 512)
10428 return splitTernaryVectorOp(Op, DAG);
10429
10430 assert(VT.getSizeInBits() == 64);
10431
10432 SDLoc DL(Op);
10433 SDValue Cond = Op.getOperand(0);
10434
10435 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10436 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10437
10438 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10439 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10440
10441 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10442 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10443
10444 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10445
10446 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10447 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10448
10449 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10450
10451 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10452 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10453}
10454
10455// Catch division cases where we can use shortcuts with rcp and rsq
10456// instructions.
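// For example, when approximations are allowed, (fdiv 1.0, x) lowers to
// (rcp x), (fdiv -1.0, x) lowers to (rcp (fneg x)), and a general
// (fdiv x, y) becomes (fmul x, (rcp y)).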
10457SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10458 SelectionDAG &DAG) const {
10459 SDLoc SL(Op);
10460 SDValue LHS = Op.getOperand(0);
10461 SDValue RHS = Op.getOperand(1);
10462 EVT VT = Op.getValueType();
10463 const SDNodeFlags Flags = Op->getFlags();
10464
10465 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10466 DAG.getTarget().Options.UnsafeFPMath;
10467
10468 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10469 // Without !fpmath accuracy information, we can't do more because we don't
10470 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10471 // f16 is always accurate enough
10472 if (!AllowInaccurateRcp && VT != MVT::f16)
10473 return SDValue();
10474
10475 if (CLHS->isExactlyValue(1.0)) {
10476 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
10477 // the CI documentation, have a worst case error of 1 ulp.
10478 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10479 // use it as long as we aren't trying to use denormals.
10480 //
10481 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10482
10483 // 1.0 / sqrt(x) -> rsq(x)
10484
10485 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10486 // error seems really high at 2^29 ULP.
10487 // 1.0 / x -> rcp(x)
10488 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10489 }
10490
10491 // Same as for 1.0, but expand the sign out of the constant.
10492 if (CLHS->isExactlyValue(-1.0)) {
10493 // -1.0 / x -> rcp (fneg x)
10494 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10495 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10496 }
10497 }
10498
10499 // For f16 require afn or arcp.
10500 // For f32 require afn.
10501 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10502 return SDValue();
10503
10504 // Turn into multiply by the reciprocal.
10505 // x / y -> x * (1.0 / y)
10506 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10507 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10508}
10509
10510SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10511 SelectionDAG &DAG) const {
10512 SDLoc SL(Op);
10513 SDValue X = Op.getOperand(0);
10514 SDValue Y = Op.getOperand(1);
10515 EVT VT = Op.getValueType();
10516 const SDNodeFlags Flags = Op->getFlags();
10517
10518 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10519 DAG.getTarget().Options.UnsafeFPMath;
10520 if (!AllowInaccurateDiv)
10521 return SDValue();
10522
10523 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10524 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10525
10526 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10527 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10528
10529 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10530 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10531 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10532 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10533 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10534 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10535}
10536
10537static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10538 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10539 SDNodeFlags Flags) {
10540 if (GlueChain->getNumValues() <= 1) {
10541 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10542 }
10543
10544 assert(GlueChain->getNumValues() == 3);
10545
10546 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10547 switch (Opcode) {
10548 default: llvm_unreachable("no chain equivalent for opcode");
10549 case ISD::FMUL:
10550 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10551 break;
10552 }
10553
10554 return DAG.getNode(Opcode, SL, VTList,
10555 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10556 Flags);
10557}
10558
10559static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10560 EVT VT, SDValue A, SDValue B, SDValue C,
10561 SDValue GlueChain, SDNodeFlags Flags) {
10562 if (GlueChain->getNumValues() <= 1) {
10563 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10564 }
10565
10566 assert(GlueChain->getNumValues() == 3);
10567
10568 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10569 switch (Opcode) {
10570 default: llvm_unreachable("no chain equivalent for opcode");
10571 case ISD::FMA:
10572 Opcode = AMDGPUISD::FMA_W_CHAIN;
10573 break;
10574 }
10575
10576 return DAG.getNode(Opcode, SL, VTList,
10577 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10578 Flags);
10579}
10580
10581SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10582 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10583 return FastLowered;
10584
10585 SDLoc SL(Op);
10586 SDValue Src0 = Op.getOperand(0);
10587 SDValue Src1 = Op.getOperand(1);
10588
10589 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10590 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10591
10592 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10593 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10594
10595 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10596 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10597
10598 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10599}
10600
10601// Faster 2.5 ULP division that does not support denormals.
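// Sketch of the scaling below: if |RHS| exceeds 2^96 the denominator is
// pre-multiplied by 2^-32 so that rcp stays in range, and the final quotient
// is multiplied by the same scale factor to compensate.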
10602SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10603 SDNodeFlags Flags = Op->getFlags();
10604 SDLoc SL(Op);
10605 SDValue LHS = Op.getOperand(1);
10606 SDValue RHS = Op.getOperand(2);
10607
10608 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10609
10610 const APFloat K0Val(0x1p+96f);
10611 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10612
10613 const APFloat K1Val(0x1p-32f);
10614 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10615
10616 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10617
10618 EVT SetCCVT =
10619 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10620
10621 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10622
10623 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10624
10625 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10626
10627 // rcp does not support denormals.
10628 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10629
10630 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10631
10632 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10633}
10634
10635// Returns immediate value for setting the F32 denorm mode when using the
10636// S_DENORM_MODE instruction.
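// The returned immediate packs the requested FP32 mode into bits [1:0] and
// keeps the function's FP64/FP16 mode in bits [3:2], matching the
// SPDenormMode | (DPDenormModeDefault << 2) computation below.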
10637 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10638 const SIMachineFunctionInfo *Info,
10639 const GCNSubtarget *ST) {
10640 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10641 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10642 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10643 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10644}
10645
10646SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10647 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10648 return FastLowered;
10649
10650 // The selection matcher assumes anything with a chain selecting to a
10651 // mayRaiseFPException machine instruction. Since we're introducing a chain
10652 // here, we need to explicitly report nofpexcept for the regular fdiv
10653 // lowering.
10654 SDNodeFlags Flags = Op->getFlags();
10655 Flags.setNoFPExcept(true);
10656
10657 SDLoc SL(Op);
10658 SDValue LHS = Op.getOperand(0);
10659 SDValue RHS = Op.getOperand(1);
10660
10661 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10662
10663 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10664
10665 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10666 {RHS, RHS, LHS}, Flags);
10667 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10668 {LHS, RHS, LHS}, Flags);
10669
10670 // Denominator is scaled to not be denormal, so using rcp is ok.
10671 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10672 DenominatorScaled, Flags);
10673 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10674 DenominatorScaled, Flags);
10675
10676 using namespace AMDGPU::Hwreg;
10677 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10678 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10679
10680 const MachineFunction &MF = DAG.getMachineFunction();
10681 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10682 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10683
10684 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10685 const bool HasDynamicDenormals =
10686 (DenormMode.Input == DenormalMode::Dynamic) ||
10687 (DenormMode.Output == DenormalMode::Dynamic);
10688
10689 SDValue SavedDenormMode;
10690
10691 if (!PreservesDenormals) {
10692 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10693 // lowering. The chain dependence is insufficient, and we need glue. We do
10694 // not need the glue variants in a strictfp function.
10695
10696 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10697
10698 SDValue Glue = DAG.getEntryNode();
10699 if (HasDynamicDenormals) {
10700 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10701 DAG.getVTList(MVT::i32, MVT::Glue),
10702 {BitField, Glue});
10703 SavedDenormMode = SDValue(GetReg, 0);
10704
10705 Glue = DAG.getMergeValues(
10706 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10707 }
10708
10709 SDNode *EnableDenorm;
10710 if (Subtarget->hasDenormModeInst()) {
10711 const SDValue EnableDenormValue =
10712 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10713
10714 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10715 EnableDenormValue)
10716 .getNode();
10717 } else {
10718 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10719 SL, MVT::i32);
10720 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10721 {EnableDenormValue, BitField, Glue});
10722 }
10723
10724 SDValue Ops[3] = {
10725 NegDivScale0,
10726 SDValue(EnableDenorm, 0),
10727 SDValue(EnableDenorm, 1)
10728 };
10729
10730 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10731 }
10732
10733 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10734 ApproxRcp, One, NegDivScale0, Flags);
10735
10736 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10737 ApproxRcp, Fma0, Flags);
10738
10739 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10740 Fma1, Fma1, Flags);
10741
10742 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10743 NumeratorScaled, Mul, Flags);
10744
10745 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10746 Fma2, Fma1, Mul, Fma2, Flags);
10747
10748 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10749 NumeratorScaled, Fma3, Flags);
10750
10751 if (!PreservesDenormals) {
10752 SDNode *DisableDenorm;
10753 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10754 const SDValue DisableDenormValue = getSPDenormModeValue(
10755 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10756
10757 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10758 Fma4.getValue(1), DisableDenormValue,
10759 Fma4.getValue(2)).getNode();
10760 } else {
10761 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10762 const SDValue DisableDenormValue =
10763 HasDynamicDenormals
10764 ? SavedDenormMode
10765 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10766
10767 DisableDenorm = DAG.getMachineNode(
10768 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10769 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10770 }
10771
10772 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10773 SDValue(DisableDenorm, 0), DAG.getRoot());
10774 DAG.setRoot(OutputChain);
10775 }
10776
10777 SDValue Scale = NumeratorScaled.getValue(1);
10778 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10779 {Fma4, Fma1, Fma3, Scale}, Flags);
10780
10781 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10782}
10783
10784SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10785 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10786 return FastLowered;
10787
10788 SDLoc SL(Op);
10789 SDValue X = Op.getOperand(0);
10790 SDValue Y = Op.getOperand(1);
10791
10792 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10793
10794 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10795
10796 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10797
10798 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10799
10800 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10801
10802 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10803
10804 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10805
10806 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10807
10808 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10809
10810 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10811 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10812
10813 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10814 NegDivScale0, Mul, DivScale1);
10815
10816 SDValue Scale;
10817
10818 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10819 // Workaround a hardware bug on SI where the condition output from div_scale
10820 // is not usable.
10821
10822 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10823
10824 // Figure out which scale to use for div_fmas.
10825 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10826 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10827 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10828 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10829
10830 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10831 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10832
10833 SDValue Scale0Hi
10834 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10835 SDValue Scale1Hi
10836 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10837
10838 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10839 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10840 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10841 } else {
10842 Scale = DivScale1.getValue(1);
10843 }
10844
10845 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10846 Fma4, Fma3, Mul, Scale);
10847
10848 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10849}
10850
10851SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10852 EVT VT = Op.getValueType();
10853
10854 if (VT == MVT::f32)
10855 return LowerFDIV32(Op, DAG);
10856
10857 if (VT == MVT::f64)
10858 return LowerFDIV64(Op, DAG);
10859
10860 if (VT == MVT::f16)
10861 return LowerFDIV16(Op, DAG);
10862
10863 llvm_unreachable("Unexpected type for fdiv");
10864}
10865
10866SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10867 SDLoc dl(Op);
10868 SDValue Val = Op.getOperand(0);
10869 EVT VT = Val.getValueType();
10870 EVT ResultExpVT = Op->getValueType(1);
10871 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10872
10873 SDValue Mant = DAG.getNode(
10874 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10875 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10876
10877 SDValue Exp = DAG.getNode(
10878 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10879 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10880
10881 if (Subtarget->hasFractBug()) {
10882 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10883 SDValue Inf =
10884 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
10885
10886 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10887 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10888 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10889 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10890 }
10891
10892 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10893 return DAG.getMergeValues({Mant, CastExp}, dl);
10894}
10895
10896SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10897 SDLoc DL(Op);
10898 StoreSDNode *Store = cast<StoreSDNode>(Op);
10899 EVT VT = Store->getMemoryVT();
10900
10901 if (VT == MVT::i1) {
10902 return DAG.getTruncStore(Store->getChain(), DL,
10903 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10904 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10905 }
10906
10907 assert(VT.isVector() &&
10908 Store->getValue().getValueType().getScalarType() == MVT::i32);
10909
10910 unsigned AS = Store->getAddressSpace();
10911 if (Subtarget->hasLDSMisalignedBug() &&
10912 AS == AMDGPUAS::FLAT_ADDRESS &&
10913 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10914 return SplitVectorStore(Op, DAG);
10915 }
10916
10917 MachineFunction &MF = DAG.getMachineFunction();
10918 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10919 // If there is a possibility that a flat instruction accesses scratch memory,
10920 // then we need to use the same legalization rules we use for private.
10921 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10922 !Subtarget->hasMultiDwordFlatScratchAddressing())
10923 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10924 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10925
10926 unsigned NumElements = VT.getVectorNumElements();
10927 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10928 AS == AMDGPUAS::FLAT_ADDRESS) {
10929 if (NumElements > 4)
10930 return SplitVectorStore(Op, DAG);
10931 // v3 stores not supported on SI.
10932 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10933 return SplitVectorStore(Op, DAG);
10934
10935 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10936 VT, *Store->getMemOperand()))
10937 return expandUnalignedStore(Store, DAG);
10938
10939 return SDValue();
10940 }
10941 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10942 switch (Subtarget->getMaxPrivateElementSize()) {
10943 case 4:
10944 return scalarizeVectorStore(Store, DAG);
10945 case 8:
10946 if (NumElements > 2)
10947 return SplitVectorStore(Op, DAG);
10948 return SDValue();
10949 case 16:
10950 if (NumElements > 4 ||
10951 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10952 return SplitVectorStore(Op, DAG);
10953 return SDValue();
10954 default:
10955 llvm_unreachable("unsupported private_element_size");
10956 }
10957 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10958 unsigned Fast = 0;
10959 auto Flags = Store->getMemOperand()->getFlags();
10960 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10961 Store->getAlign(), Flags, &Fast) &&
10962 Fast > 1)
10963 return SDValue();
10964
10965 if (VT.isVector())
10966 return SplitVectorStore(Op, DAG);
10967
10968 return expandUnalignedStore(Store, DAG);
10969 }
10970
10971 // Probably an invalid store. If so we'll end up emitting a selection error.
10972 return SDValue();
10973}
10974
10975// Avoid the full correct expansion for f32 sqrt when promoting from f16.
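// The operand here is an f16 value extended to f32, so the plain f32 sqrt
// intrinsic is considered accurate enough for the f16 result and the
// scaling/denormal fixups used in lowerFSQRTF32 are skipped.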
10976SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10977 SDLoc SL(Op);
10978 assert(!Subtarget->has16BitInsts());
10979 SDNodeFlags Flags = Op->getFlags();
10980 SDValue Ext =
10981 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10982
10983 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10984 SDValue Sqrt =
10985 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10986
10987 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10988 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10989}
10990
10991SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10992 SDLoc DL(Op);
10993 SDNodeFlags Flags = Op->getFlags();
10994 MVT VT = Op.getValueType().getSimpleVT();
10995 const SDValue X = Op.getOperand(0);
10996
10997 if (allowApproxFunc(DAG, Flags)) {
10998 // Instruction is 1ulp but ignores denormals.
10999 return DAG.getNode(
11000 ISD::INTRINSIC_WO_CHAIN, DL, VT,
11001 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11002 }
11003
11004 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11005 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11006
11007 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11008
11009 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11010
11011 SDValue SqrtX =
11012 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11013
11014 SDValue SqrtS;
11015 if (needsDenormHandlingF32(DAG, X, Flags)) {
11016 SDValue SqrtID =
11017 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11018 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11019
11020 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11021 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11022 DAG.getConstant(-1, DL, MVT::i32));
11023 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11024
11025 SDValue NegSqrtSNextDown =
11026 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11027
11028 SDValue SqrtVP =
11029 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11030
11031 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11032 DAG.getConstant(1, DL, MVT::i32));
11033 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11034
11035 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11036 SDValue SqrtVS =
11037 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11038
11039 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11040 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11041
11042 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11043 Flags);
11044
11045 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11046 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11047 Flags);
11048 } else {
11049 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11050
11051 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11052
11053 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11054 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11055 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11056
11057 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11058 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11059 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11060
11061 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11062 SDValue SqrtD =
11063 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11064 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11065 }
11066
11067 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11068
11069 SDValue ScaledDown =
11070 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11071
11072 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11073 SDValue IsZeroOrInf =
11074 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11075 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11076
11077 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11078}
11079
11080SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11081 // For double type, the SQRT and RSQ instructions don't have required
11082 // precision, we apply Goldschmidt's algorithm to improve the result:
11083 //
11084 // y0 = rsq(x)
11085 // g0 = x * y0
11086 // h0 = 0.5 * y0
11087 //
11088 // r0 = 0.5 - h0 * g0
11089 // g1 = g0 * r0 + g0
11090 // h1 = h0 * r0 + h0
11091 //
11092 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11093 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11094 // h2 = h1 * r1 + h1
11095 //
11096 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11097 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11098 //
11099 // sqrt(x) = g3
11100
11101 SDNodeFlags Flags = Op->getFlags();
11102
11103 SDLoc DL(Op);
11104
11105 SDValue X = Op.getOperand(0);
11106 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11107
11108 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11109
11110 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11111
11112 // Scale up input if it is too small.
11113 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11114 SDValue ScaleUp =
11115 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11116 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11117
11118 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11119
11120 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11121
11122 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11123 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11124
11125 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11126 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11127
11128 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11129
11130 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11131
11132 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11133 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11134
11135 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11136
11137 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11138 SDValue SqrtD1 =
11139 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11140
11141 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11142
11143 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11144 SDValue ScaleDown =
11145 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11146 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11147
11148 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11149 // with finite only or nsz because rsq(+/-0) = +/-inf
11150
11151 // TODO: Check for DAZ and expand to subnormals
11152 SDValue IsZeroOrInf =
11153 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11154 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11155
11156 // If x is +INF, +0, or -0, use its original value
11157 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11158 Flags);
11159}
11160
11161SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11162 SDLoc DL(Op);
11163 EVT VT = Op.getValueType();
11164 SDValue Arg = Op.getOperand(0);
11165 SDValue TrigVal;
11166
11167 // Propagate fast-math flags so that the multiply we introduce can be folded
11168 // if Arg is already the result of a multiply by constant.
11169 auto Flags = Op->getFlags();
11170
11171 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11172
11173 if (Subtarget->hasTrigReducedRange()) {
11174 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11175 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11176 } else {
11177 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11178 }
11179
11180 switch (Op.getOpcode()) {
11181 case ISD::FCOS:
11182 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11183 case ISD::FSIN:
11184 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11185 default:
11186 llvm_unreachable("Wrong trig opcode");
11187 }
11188}
11189
11190SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11191 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11192 assert(AtomicNode->isCompareAndSwap());
11193 unsigned AS = AtomicNode->getAddressSpace();
11194
11195 // No custom lowering required for local address space
11196 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11197 return Op;
11198
11199 // Non-local address space requires custom lowering for atomic compare
11200 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11201 SDLoc DL(Op);
11202 SDValue ChainIn = Op.getOperand(0);
11203 SDValue Addr = Op.getOperand(1);
11204 SDValue Old = Op.getOperand(2);
11205 SDValue New = Op.getOperand(3);
11206 EVT VT = Op.getValueType();
11207 MVT SimpleVT = VT.getSimpleVT();
11208 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11209
11210 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11211 SDValue Ops[] = { ChainIn, Addr, NewOld };
11212
11213 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11214 Ops, VT, AtomicNode->getMemOperand());
11215}
11216
11217//===----------------------------------------------------------------------===//
11218// Custom DAG optimizations
11219//===----------------------------------------------------------------------===//
11220
11221SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11222 DAGCombinerInfo &DCI) const {
11223 EVT VT = N->getValueType(0);
11224 EVT ScalarVT = VT.getScalarType();
11225 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11226 return SDValue();
11227
11228 SelectionDAG &DAG = DCI.DAG;
11229 SDLoc DL(N);
11230
11231 SDValue Src = N->getOperand(0);
11232 EVT SrcVT = Src.getValueType();
11233
11234 // TODO: We could try to match extracting the higher bytes, which would be
11235 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11236 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11237 // about in practice.
11238 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11239 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11240 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11241 DCI.AddToWorklist(Cvt.getNode());
11242
11243 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11244 if (ScalarVT != MVT::f32) {
11245 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11246 DAG.getTargetConstant(0, DL, MVT::i32));
11247 }
11248 return Cvt;
11249 }
11250 }
11251
11252 return SDValue();
11253}
11254
11255SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11256 DAGCombinerInfo &DCI) const {
11257 SDValue MagnitudeOp = N->getOperand(0);
11258 SDValue SignOp = N->getOperand(1);
11259 SelectionDAG &DAG = DCI.DAG;
11260 SDLoc DL(N);
11261
11262 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11263 // lower half with a copy.
11264 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11265 if (MagnitudeOp.getValueType() == MVT::f64) {
11266 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11267 SDValue MagLo =
11268 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11269 DAG.getConstant(0, DL, MVT::i32));
11270 SDValue MagHi =
11271 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11272 DAG.getConstant(1, DL, MVT::i32));
11273
11274 SDValue HiOp =
11275 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11276
11277 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11278
11279 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11280 }
11281
11282 if (SignOp.getValueType() != MVT::f64)
11283 return SDValue();
11284
11285 // Reduce width of sign operand, we only need the highest bit.
11286 //
11287 // fcopysign f64:x, f64:y ->
11288 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11289 // TODO: In some cases it might make sense to go all the way to f16.
11290 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11291 SDValue SignAsF32 =
11292 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11293 DAG.getConstant(1, DL, MVT::i32));
11294
11295 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11296 SignAsF32);
11297}
11298
11299// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11300// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11301// bits
11302
11303// This is a variant of
11304// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11305//
11306// The normal DAG combiner will do this, but only if the add has one use since
11307// that would increase the number of instructions.
11308//
11309// This prevents us from seeing a constant offset that can be folded into a
11310// memory instruction's addressing mode. If we know the resulting add offset of
11311// a pointer can be folded into an addressing offset, we can replace the pointer
11312// operand with the add of new constant offset. This eliminates one of the uses,
11313// and may allow the remaining use to also be simplified.
11314//
11315SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11316 unsigned AddrSpace,
11317 EVT MemVT,
11318 DAGCombinerInfo &DCI) const {
11319 SDValue N0 = N->getOperand(0);
11320 SDValue N1 = N->getOperand(1);
11321
11322 // We only do this to handle cases where it's profitable when there are
11323 // multiple uses of the add, so defer to the standard combine.
11324 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11325 N0->hasOneUse())
11326 return SDValue();
11327
11328 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11329 if (!CN1)
11330 return SDValue();
11331
11332 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11333 if (!CAdd)
11334 return SDValue();
11335
11336 SelectionDAG &DAG = DCI.DAG;
11337
11338 if (N0->getOpcode() == ISD::OR &&
11339 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11340 return SDValue();
11341
11342 // If the resulting offset is too large, we can't fold it into the
11343 // addressing mode offset.
11344 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11345 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11346
11347 AddrMode AM;
11348 AM.HasBaseReg = true;
11349 AM.BaseOffs = Offset.getSExtValue();
11350 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11351 return SDValue();
11352
11353 SDLoc SL(N);
11354 EVT VT = N->getValueType(0);
11355
11356 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11357 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11358
11359 SDNodeFlags Flags;
11360 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11361 (N0.getOpcode() == ISD::OR ||
11362 N0->getFlags().hasNoUnsignedWrap()));
11363
11364 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11365}
11366
11367/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
11368/// by the chain and intrinsic ID. Theoretically we would also need to check the
11369/// specific intrinsic, but they all place the pointer operand first.
11370static unsigned getBasePtrIndex(const MemSDNode *N) {
11371 switch (N->getOpcode()) {
11372 case ISD::STORE:
11373 case ISD::INTRINSIC_W_CHAIN:
11374 case ISD::INTRINSIC_VOID:
11375 return 2;
11376 default:
11377 return 1;
11378 }
11379}
11380
11381SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11382 DAGCombinerInfo &DCI) const {
11383 SelectionDAG &DAG = DCI.DAG;
11384 SDLoc SL(N);
11385
11386 unsigned PtrIdx = getBasePtrIndex(N);
11387 SDValue Ptr = N->getOperand(PtrIdx);
11388
11389 // TODO: We could also do this for multiplies.
11390 if (Ptr.getOpcode() == ISD::SHL) {
11391 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11392 N->getMemoryVT(), DCI);
11393 if (NewPtr) {
11394 SmallVector<SDValue, 8> NewOps(N->ops());
11395
11396 NewOps[PtrIdx] = NewPtr;
11397 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11398 }
11399 }
11400
11401 return SDValue();
11402}
11403
11404static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11405 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11406 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11407 (Opc == ISD::XOR && Val == 0);
11408}
11409
11410// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11411// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11412// integer combine opportunities since most 64-bit operations are decomposed
11413// this way. TODO: We won't want this for SALU especially if it is an inline
11414// immediate.
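// For example, (and i64:x, 0x00000000ffffffff) splits so that the low half
// keeps x.lo unchanged and the high half folds to zero, since AND with
// all-ones and AND with zero are both trivially reducible.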
11415SDValue SITargetLowering::splitBinaryBitConstantOp(
11416 DAGCombinerInfo &DCI,
11417 const SDLoc &SL,
11418 unsigned Opc, SDValue LHS,
11419 const ConstantSDNode *CRHS) const {
11420 uint64_t Val = CRHS->getZExtValue();
11421 uint32_t ValLo = Lo_32(Val);
11422 uint32_t ValHi = Hi_32(Val);
11423 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11424
11425 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11426 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11427 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11428 // If we need to materialize a 64-bit immediate, it will be split up later
11429 // anyway. Avoid creating the harder to understand 64-bit immediate
11430 // materialization.
11431 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11432 }
11433
11434 return SDValue();
11435}
11436
11437 bool llvm::isBoolSGPR(SDValue V) {
11438 if (V.getValueType() != MVT::i1)
11439 return false;
11440 switch (V.getOpcode()) {
11441 default:
11442 break;
11443 case ISD::SETCC:
11444 case AMDGPUISD::FP_CLASS:
11445 return true;
11446 case ISD::AND:
11447 case ISD::OR:
11448 case ISD::XOR:
11449 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11450 }
11451 return false;
11452}
11453
11454// If a constant has all zeroes or all ones within each byte return it.
11455// Otherwise return 0.
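// For example, 0x00ff00ff is returned unchanged (every byte is 0x00 or 0xff),
// while 0x0000ff80 returns 0 because its low byte is only partially set.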
11456 static uint32_t getConstantPermuteMask(uint32_t C) {
11457 // 0xff for any zero byte in the mask
11458 uint32_t ZeroByteMask = 0;
11459 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11460 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11461 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11462 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11463 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11464 if ((NonZeroByteMask & C) != NonZeroByteMask)
11465 return 0; // Partial bytes selected.
11466 return C;
11467}
11468
11469// Check if a node selects whole bytes from its operand 0 starting at a byte
11470// boundary while masking the rest. Returns select mask as in the v_perm_b32
11471 // or -1 if it did not succeed.
11472// Note byte select encoding:
11473// value 0-3 selects corresponding source byte;
11474// value 0xc selects zero;
11475// value 0xff selects 0xff.
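// For example, (and x, 0x0000ffff) produces the mask 0x0c0c0100 (bytes 0-1
// taken from x, bytes 2-3 forced to zero), and (srl x, 16) produces
// 0x0c0c0302.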
11476 static uint32_t getPermuteMask(SDValue V) {
11477 assert(V.getValueSizeInBits() == 32);
11478
11479 if (V.getNumOperands() != 2)
11480 return ~0;
11481
11482 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11483 if (!N1)
11484 return ~0;
11485
11486 uint32_t C = N1->getZExtValue();
11487
11488 switch (V.getOpcode()) {
11489 default:
11490 break;
11491 case ISD::AND:
11492 if (uint32_t ConstMask = getConstantPermuteMask(C))
11493 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11494 break;
11495
11496 case ISD::OR:
11497 if (uint32_t ConstMask = getConstantPermuteMask(C))
11498 return (0x03020100 & ~ConstMask) | ConstMask;
11499 break;
11500
11501 case ISD::SHL:
11502 if (C % 8)
11503 return ~0;
11504
11505 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11506
11507 case ISD::SRL:
11508 if (C % 8)
11509 return ~0;
11510
11511 return uint32_t(0x0c0c0c0c03020100ull >> C);
11512 }
11513
11514 return ~0;
11515}
11516
11517SDValue SITargetLowering::performAndCombine(SDNode *N,
11518 DAGCombinerInfo &DCI) const {
11519 if (DCI.isBeforeLegalize())
11520 return SDValue();
11521
11522 SelectionDAG &DAG = DCI.DAG;
11523 EVT VT = N->getValueType(0);
11524 SDValue LHS = N->getOperand(0);
11525 SDValue RHS = N->getOperand(1);
11526
11527
11528 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11529 if (VT == MVT::i64 && CRHS) {
11530 if (SDValue Split
11531 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11532 return Split;
11533 }
11534
11535 if (CRHS && VT == MVT::i32) {
11536 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11537 // nb = number of trailing zeroes in mask
11538 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11539 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
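// Worked example: for (and (srl x, 4), 0xff0) we get Bits = 8, Shift = 4,
// NB = 4 and Offset = 8, so the pair becomes
// (shl (AssertZext (bfe x, 8, 8)), 4).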
11540 uint64_t Mask = CRHS->getZExtValue();
11541 unsigned Bits = llvm::popcount(Mask);
11542 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11543 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11544 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11545 unsigned Shift = CShift->getZExtValue();
11546 unsigned NB = CRHS->getAPIntValue().countr_zero();
11547 unsigned Offset = NB + Shift;
11548 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11549 SDLoc SL(N);
11550 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11551 LHS->getOperand(0),
11552 DAG.getConstant(Offset, SL, MVT::i32),
11553 DAG.getConstant(Bits, SL, MVT::i32));
11554 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11555 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11556 DAG.getValueType(NarrowVT));
11557 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11558 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11559 return Shl;
11560 }
11561 }
11562 }
11563
11564 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11565 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11566 isa<ConstantSDNode>(LHS.getOperand(2))) {
11567 uint32_t Sel = getConstantPermuteMask(Mask);
11568 if (!Sel)
11569 return SDValue();
11570
11571 // Select 0xc for all zero bytes
11572 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11573 SDLoc DL(N);
11574 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11575 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11576 }
11577 }
11578
11579 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11580 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11581 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11582 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11583 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11584
11585 SDValue X = LHS.getOperand(0);
11586 SDValue Y = RHS.getOperand(0);
11587 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11588 !isTypeLegal(X.getValueType()))
11589 return SDValue();
11590
11591 if (LCC == ISD::SETO) {
11592 if (X != LHS.getOperand(1))
11593 return SDValue();
11594
11595 if (RCC == ISD::SETUNE) {
11596 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11597 if (!C1 || !C1->isInfinity() || C1->isNegative())
11598 return SDValue();
11599
11600 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11601 SIInstrFlags::P_NORMAL |
11602 SIInstrFlags::N_ZERO |
11603 SIInstrFlags::P_ZERO |
11604 SIInstrFlags::N_SUBNORMAL |
11605 SIInstrFlags::P_SUBNORMAL;
11606
11607 static_assert(((~(SIInstrFlags::S_NAN |
11608 SIInstrFlags::Q_NAN |
11609 SIInstrFlags::N_INFINITY |
11610 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11611 "mask not equal");
11612
11613 SDLoc DL(N);
11614 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11615 X, DAG.getConstant(Mask, DL, MVT::i32));
11616 }
11617 }
11618 }
11619
11620 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11621 std::swap(LHS, RHS);
11622
11623 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11624 RHS.hasOneUse()) {
11625 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11626 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11627 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11628 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11629 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11630 (RHS.getOperand(0) == LHS.getOperand(0) &&
11631 LHS.getOperand(0) == LHS.getOperand(1))) {
11632 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11633 unsigned NewMask = LCC == ISD::SETO ?
11634 Mask->getZExtValue() & ~OrdMask :
11635 Mask->getZExtValue() & OrdMask;
11636
11637 SDLoc DL(N);
11638 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11639 DAG.getConstant(NewMask, DL, MVT::i32));
11640 }
11641 }
11642
11643 if (VT == MVT::i32 &&
11644 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11645 // and x, (sext cc from i1) => select cc, x, 0
11646 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11647 std::swap(LHS, RHS);
11648 if (isBoolSGPR(RHS.getOperand(0)))
11649 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11650 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11651 }
11652
11653 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11654 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11655 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11656 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11657 uint32_t LHSMask = getPermuteMask(LHS);
11658 uint32_t RHSMask = getPermuteMask(RHS);
11659 if (LHSMask != ~0u && RHSMask != ~0u) {
11660 // Canonicalize the expression in an attempt to have fewer unique masks
11661 // and therefore fewer registers used to hold the masks.
11662 if (LHSMask > RHSMask) {
11663 std::swap(LHSMask, RHSMask);
11664 std::swap(LHS, RHS);
11665 }
11666
11667 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11668 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11669 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11670 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11671
11672 // Check if we need to combine values from two sources within a byte.
11673 if (!(LHSUsedLanes & RHSUsedLanes) &&
11674 // If we select high and lower word keep it for SDWA.
11675 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11676 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11677 // Each byte of each mask is either a lane selector (0-3), 0xff for a
11678 // constant 0xff byte, or 0x0c for a zero byte. Under 'and', a 0x0c in
11679 // either mask makes the result byte zero, so it must stay 0x0c; otherwise
11680 // the byte that is not 0xff wins, since 0xff is the identity for 'and'.
11681 // Anding both masks is correct except 0x0c bytes must be forced to 0x0c.
11682 uint32_t Mask = LHSMask & RHSMask;
11683 for (unsigned I = 0; I < 32; I += 8) {
11684 uint32_t ByteSel = 0xff << I;
11685 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11686 Mask &= (0x0c << I) & 0xffffffff;
11687 }
11688
11689 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11690 // or 0x0c.
11691 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11692 SDLoc DL(N);
11693
11694 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11695 LHS.getOperand(0), RHS.getOperand(0),
11696 DAG.getConstant(Sel, DL, MVT::i32));
11697 }
11698 }
11699 }
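// Worked example for the combine above (mask values assumed for
// illustration): if getPermuteMask returned LHSMask = 0x03ff01ff and
// RHSMask = 0xff02ff00, the used lanes would be 0x0c000c00 and 0x000c000c
// (disjoint), Mask = LHSMask & RHSMask = 0x03020100, and adding 4 to the
// active LHS lanes gives Sel = 0x07020500: a perm taking bytes 3 and 1 from
// the first source and bytes 2 and 0 from the second.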
11700
11701 return SDValue();
11702}
11703
11704 // A key component of v_perm is a mapping between byte position of the src
11705 // operands, and the byte position of the dest. To provide such, we need: 1. the
11706 // node that provides byte x of the dest of the OR, and 2. the byte of that node
11707 // used to provide byte x. calculateByteProvider finds which node provides
11708 // a certain byte of the dest of the OR, and calculateSrcByte takes that node
11709 // and finds the ultimate src and byte position. For example, the supported
11710 // LoadCombine pattern for vector loads is as follows:
11711// t1
11712// or
11713// / \
11714// t2 t3
11715// zext shl
11716// | | \
11717// t4 t5 16
11718// or anyext
11719// / \ |
11720// t6 t7 t8
11721// srl shl or
11722// / | / \ / \
11723// t9 t10 t11 t12 t13 t14
11724// trunc* 8 trunc* 8 and and
11725// | | / | | \
11726// t15 t16 t17 t18 t19 t20
11727// trunc* 255 srl -256
11728// | / \
11729// t15 t15 16
11730//
11731// *In this example, the truncs are from i32->i16
11732//
11733// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11734// respectively. calculateSrcByte would find (given node) -> ultimate src &
11735 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11736// After finding the mapping, we can combine the tree into vperm t15, t16,
11737// 0x05000407
11738
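// Note on the selector encoding used by these combines: each byte of the
// 32-bit v_perm selector picks one byte of the concatenated sources, where
// values 0-3 select bytes of the second PERM operand, values 4-7 select
// bytes of the first operand, and 0x0c produces a constant zero byte.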
11739// Find the source and byte position from a node.
11740// \p DestByte is the byte position of the dest of the or that the src
11741// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11742// dest of the or byte. \p Depth tracks how many recursive iterations we have
11743// performed.
11744static const std::optional<ByteProvider<SDValue>>
11745calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11746 unsigned Depth = 0) {
11747 // We may need to recursively traverse a series of SRLs
11748 if (Depth >= 6)
11749 return std::nullopt;
11750
11751 if (Op.getValueSizeInBits() < 8)
11752 return std::nullopt;
11753
11754 if (Op.getValueType().isVector())
11755 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11756
11757 switch (Op->getOpcode()) {
11758 case ISD::TRUNCATE: {
11759 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11760 }
11761
11762 case ISD::SIGN_EXTEND:
11763 case ISD::ZERO_EXTEND:
11764 case ISD::SIGN_EXTEND_INREG: {
11765 SDValue NarrowOp = Op->getOperand(0);
11766 auto NarrowVT = NarrowOp.getValueType();
11767 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11768 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11769 NarrowVT = VTSign->getVT();
11770 }
11771 if (!NarrowVT.isByteSized())
11772 return std::nullopt;
11773 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11774
11775 if (SrcIndex >= NarrowByteWidth)
11776 return std::nullopt;
11777 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11778 }
11779
11780 case ISD::SRA:
11781 case ISD::SRL: {
11782 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11783 if (!ShiftOp)
11784 return std::nullopt;
11785
11786 uint64_t BitShift = ShiftOp->getZExtValue();
11787
11788 if (BitShift % 8 != 0)
11789 return std::nullopt;
11790
11791 SrcIndex += BitShift / 8;
11792
11793 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11794 }
11795
11796 default: {
11797 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11798 }
11799 }
11800 llvm_unreachable("fully handled switch");
11801}
11802
11803// For a byte position in the result of an Or, traverse the tree and find the
11804// node (and the byte of the node) which ultimately provides this {Or,
11805// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11806// the byte position of the Op that corresponds with the originally requested
11807// byte of the Or \p Depth tracks how many recursive iterations we have
11808// performed. \p StartingIndex is the originally requested byte of the Or
11809static const std::optional<ByteProvider<SDValue>>
11810calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11811 unsigned StartingIndex = 0) {
11812 // Finding Src tree of RHS of or typically requires at least 1 additional
11813 // depth
11814 if (Depth > 6)
11815 return std::nullopt;
11816
11817 unsigned BitWidth = Op.getScalarValueSizeInBits();
11818 if (BitWidth % 8 != 0)
11819 return std::nullopt;
11820 if (Index > BitWidth / 8 - 1)
11821 return std::nullopt;
11822
11823 bool IsVec = Op.getValueType().isVector();
11824 switch (Op.getOpcode()) {
11825 case ISD::OR: {
11826 if (IsVec)
11827 return std::nullopt;
11828
11829 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11830 StartingIndex);
11831 if (!RHS)
11832 return std::nullopt;
11833 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11834 StartingIndex);
11835 if (!LHS)
11836 return std::nullopt;
11837 // A well formed Or will have two ByteProviders for each byte, one of which
11838 // is constant zero
11839 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11840 return std::nullopt;
11841 if (!LHS || LHS->isConstantZero())
11842 return RHS;
11843 if (!RHS || RHS->isConstantZero())
11844 return LHS;
11845 return std::nullopt;
11846 }
11847
11848 case ISD::AND: {
11849 if (IsVec)
11850 return std::nullopt;
11851
11852 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11853 if (!BitMaskOp)
11854 return std::nullopt;
11855
11856 uint32_t BitMask = BitMaskOp->getZExtValue();
11857 // Bits we expect for our StartingIndex
11858 uint32_t IndexMask = 0xFF << (Index * 8);
11859
11860 if ((IndexMask & BitMask) != IndexMask) {
11861 // If the result of the and partially provides the byte, then it
11862 // is not well formatted
11863 if (IndexMask & BitMask)
11864 return std::nullopt;
11865 return ByteProvider<SDValue>::getConstantZero();
11866 }
11867
11868 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11869 }
11870
11871 case ISD::FSHR: {
11872 if (IsVec)
11873 return std::nullopt;
11874
11875 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11876 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11877 if (!ShiftOp || Op.getValueType().isVector())
11878 return std::nullopt;
11879
11880 uint64_t BitsProvided = Op.getValueSizeInBits();
11881 if (BitsProvided % 8 != 0)
11882 return std::nullopt;
11883
11884 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11885 if (BitShift % 8)
11886 return std::nullopt;
11887
11888 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11889 uint64_t ByteShift = BitShift / 8;
11890
11891 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11892 uint64_t BytesProvided = BitsProvided / 8;
11893 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11894 NewIndex %= BytesProvided;
11895 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11896 }
11897
11898 case ISD::SRA:
11899 case ISD::SRL: {
11900 if (IsVec)
11901 return std::nullopt;
11902
11903 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11904 if (!ShiftOp)
11905 return std::nullopt;
11906
11907 uint64_t BitShift = ShiftOp->getZExtValue();
11908 if (BitShift % 8)
11909 return std::nullopt;
11910
11911 auto BitsProvided = Op.getScalarValueSizeInBits();
11912 if (BitsProvided % 8 != 0)
11913 return std::nullopt;
11914
11915 uint64_t BytesProvided = BitsProvided / 8;
11916 uint64_t ByteShift = BitShift / 8;
11917 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11918 // If the byte we are trying to provide (as tracked by index) falls in this
11919 // range, then the SRL provides the byte. The byte of interest of the src of
11920 // the SRL is Index + ByteShift
11921 return BytesProvided - ByteShift > Index
11922 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11923 Index + ByteShift)
11924 : std::nullopt;
11925 }
11926
11927 case ISD::SHL: {
11928 if (IsVec)
11929 return std::nullopt;
11930
11931 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11932 if (!ShiftOp)
11933 return std::nullopt;
11934
11935 uint64_t BitShift = ShiftOp->getZExtValue();
11936 if (BitShift % 8 != 0)
11937 return std::nullopt;
11938 uint64_t ByteShift = BitShift / 8;
11939
11940 // If we are shifting by an amount greater than (or equal to)
11941 // the index we are trying to provide, then it provides 0s. If not,
11942 // then these bytes are not definitively 0s, and the corresponding byte
11943 // of interest is Index - ByteShift of the src
11944 return Index < ByteShift
11945 ? ByteProvider<SDValue>::getConstantZero()
11946 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11947 Depth + 1, StartingIndex);
11948 }
11949 case ISD::ANY_EXTEND:
11950 case ISD::SIGN_EXTEND:
11951 case ISD::ZERO_EXTEND:
11952 case ISD::SIGN_EXTEND_INREG:
11953 case ISD::AssertZext:
11954 case ISD::AssertSext: {
11955 if (IsVec)
11956 return std::nullopt;
11957
11958 SDValue NarrowOp = Op->getOperand(0);
11959 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11960 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11961 Op->getOpcode() == ISD::AssertZext ||
11962 Op->getOpcode() == ISD::AssertSext) {
11963 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11964 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11965 }
11966 if (NarrowBitWidth % 8 != 0)
11967 return std::nullopt;
11968 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11969
11970 if (Index >= NarrowByteWidth)
11971 return Op.getOpcode() == ISD::ZERO_EXTEND
11972 ? std::optional<ByteProvider<SDValue>>(
11973 ByteProvider<SDValue>::getConstantZero())
11974 : std::nullopt;
11975 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11976 }
11977
11978 case ISD::TRUNCATE: {
11979 if (IsVec)
11980 return std::nullopt;
11981
11982 uint64_t NarrowByteWidth = BitWidth / 8;
11983
11984 if (NarrowByteWidth >= Index) {
11985 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11986 StartingIndex);
11987 }
11988
11989 return std::nullopt;
11990 }
11991
11992 case ISD::CopyFromReg: {
11993 if (BitWidth / 8 > Index)
11994 return calculateSrcByte(Op, StartingIndex, Index);
11995
11996 return std::nullopt;
11997 }
11998
11999 case ISD::LOAD: {
12000 auto L = cast<LoadSDNode>(Op.getNode());
12001
12002 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12003 if (NarrowBitWidth % 8 != 0)
12004 return std::nullopt;
12005 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12006
12007 // If the width of the load does not reach the byte we are trying to
12008 // provide and it is not a ZEXTLOAD, then the load does not provide for
12009 // the byte in question
12010 if (Index >= NarrowByteWidth) {
12011 return L->getExtensionType() == ISD::ZEXTLOAD
12012 ? std::optional<ByteProvider<SDValue>>(
12013 ByteProvider<SDValue>::getConstantZero())
12014 : std::nullopt;
12015 }
12016
12017 if (NarrowByteWidth > Index) {
12018 return calculateSrcByte(Op, StartingIndex, Index);
12019 }
12020
12021 return std::nullopt;
12022 }
12023
12024 case ISD::BSWAP: {
12025 if (IsVec)
12026 return std::nullopt;
12027
12028 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12029 Depth + 1, StartingIndex);
12030 }
12031
12032 case ISD::EXTRACT_VECTOR_ELT: {
12033 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12034 if (!IdxOp)
12035 return std::nullopt;
12036 auto VecIdx = IdxOp->getZExtValue();
12037 auto ScalarSize = Op.getScalarValueSizeInBits();
12038 if (ScalarSize < 32)
12039 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12040 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12041 StartingIndex, Index);
12042 }
12043
12044 case AMDGPUISD::PERM: {
12045 if (IsVec)
12046 return std::nullopt;
12047
12048 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12049 if (!PermMask)
12050 return std::nullopt;
12051
12052 auto IdxMask =
12053 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12054 if (IdxMask > 0x07 && IdxMask != 0x0c)
12055 return std::nullopt;
12056
12057 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12058 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12059
12060 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12061 : ByteProvider<SDValue>::getConstantZero();
12063 }
12064
12065 default: {
12066 return std::nullopt;
12067 }
12068 }
12069
12070 llvm_unreachable("fully handled switch");
12071}
12072
12073 // Returns true if the Operand is a scalar extended or loaded from a 16-bit value
12074static bool isExtendedFrom16Bits(SDValue &Operand) {
12075
12076 switch (Operand.getOpcode()) {
12077 case ISD::ANY_EXTEND:
12078 case ISD::SIGN_EXTEND:
12079 case ISD::ZERO_EXTEND: {
12080 auto OpVT = Operand.getOperand(0).getValueType();
12081 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12082 }
12083 case ISD::LOAD: {
12084 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12085 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12086 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12087 ExtType == ISD::EXTLOAD) {
12088 auto MemVT = L->getMemoryVT();
12089 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12090 }
12091 return L->getMemoryVT().getSizeInBits() == 16;
12092 }
12093 default:
12094 return false;
12095 }
12096}
12097
12098 // Returns true if the mask matches consecutive bytes, and the first byte
12099 // begins at an even (16-bit aligned) byte offset from the 0th byte
12100static bool addresses16Bits(int Mask) {
12101 int Low8 = Mask & 0xff;
12102 int Hi8 = (Mask & 0xff00) >> 8;
12103
12104 assert(Low8 < 8 && Hi8 < 8);
12105 // Are the bytes contiguous in the order of increasing addresses.
12106 bool IsConsecutive = (Hi8 - Low8 == 1);
12107 // Is the first byte at a location that is aligned for 16 bit instructions.
12108 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12109 // In this case, we still need code to extract the 16 bit operand, so it
12110 // is better to use i8 v_perm
12111 bool Is16Aligned = !(Low8 % 2);
12112
12113 return IsConsecutive && Is16Aligned;
12114}
12115
12116// Do not lower into v_perm if the operands are actually 16 bit
12117// and the selected bits (based on PermMask) correspond with two
12118// easily addressable 16 bit operands.
12119 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12120 SDValue &OtherOp) {
12121 int Low16 = PermMask & 0xffff;
12122 int Hi16 = (PermMask & 0xffff0000) >> 16;
12123
12124 auto TempOp = peekThroughBitcasts(Op);
12125 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12126
12127 auto OpIs16Bit =
12128 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12129 if (!OpIs16Bit)
12130 return true;
12131
12132 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12133 isExtendedFrom16Bits(TempOtherOp);
12134 if (!OtherOpIs16Bit)
12135 return true;
12136
12137 // Do we cleanly address both
12138 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12139}
12140
12141 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12142 unsigned DWordOffset) {
12143 SDValue Ret;
12144
12145 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12146 // ByteProvider must be at least 8 bits
12147 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12148
12149 if (TypeSize <= 32)
12150 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12151
12152 if (Src.getValueType().isVector()) {
12153 auto ScalarTySize = Src.getScalarValueSizeInBits();
12154 auto ScalarTy = Src.getValueType().getScalarType();
12155 if (ScalarTySize == 32) {
12156 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12157 DAG.getConstant(DWordOffset, SL, MVT::i32));
12158 }
12159 if (ScalarTySize > 32) {
12160 Ret = DAG.getNode(
12161 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12162 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12163 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12164 if (ShiftVal)
12165 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12166 DAG.getConstant(ShiftVal, SL, MVT::i32));
12167 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12168 }
12169
12170 assert(ScalarTySize < 32);
12171 auto NumElements = TypeSize / ScalarTySize;
12172 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12173 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12174 auto NumElementsIn32 = 32 / ScalarTySize;
12175 auto NumAvailElements = DWordOffset < Trunc32Elements
12176 ? NumElementsIn32
12177 : NumElements - NormalizedTrunc;
12178
12179 SmallVector<SDValue, 4> VecSrcs;
12180 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12181 NumAvailElements);
12182
12183 Ret = DAG.getBuildVector(
12184 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12185 VecSrcs);
12186 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12187 }
12188
12189 /// Scalar Type
12190 auto ShiftVal = 32 * DWordOffset;
12191 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12192 DAG.getConstant(ShiftVal, SL, MVT::i32));
12193 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12194}
12195
12196 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12197 SelectionDAG &DAG = DCI.DAG;
12198 [[maybe_unused]] EVT VT = N->getValueType(0);
12199 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12200
12201 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12202 assert(VT == MVT::i32);
12203 for (int i = 0; i < 4; i++) {
12204 // Find the ByteProvider that provides the ith byte of the result of OR
12205 std::optional<ByteProvider<SDValue>> P =
12206 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12207 // TODO support constantZero
12208 if (!P || P->isConstantZero())
12209 return SDValue();
12210
12211 PermNodes.push_back(*P);
12212 }
12213 if (PermNodes.size() != 4)
12214 return SDValue();
12215
12216 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12217 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12218 uint64_t PermMask = 0x00000000;
12219 for (size_t i = 0; i < PermNodes.size(); i++) {
12220 auto PermOp = PermNodes[i];
12221 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12222 // by sizeof(Src2) = 4
12223 int SrcByteAdjust = 4;
12224
12225 // If the Src uses a byte from a different DWORD, then it corresponds
12226 // with a different source
12227 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12228 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12229 if (SecondSrc)
12230 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12231 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12232 return SDValue();
12233
12234 // Set the index of the second distinct Src node
12235 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12236 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12237 SrcByteAdjust = 0;
12238 }
12239 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12241 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12242 }
12243 SDLoc DL(N);
12244 SDValue Op = *PermNodes[FirstSrc.first].Src;
12245 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12246 assert(Op.getValueSizeInBits() == 32);
12247
12248 // Check that we are not just extracting the bytes in order from an op
12249 if (!SecondSrc) {
12250 int Low16 = PermMask & 0xffff;
12251 int Hi16 = (PermMask & 0xffff0000) >> 16;
12252
12253 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12254 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12255
12256 // The perm op would really just produce Op. So combine into Op
12257 if (WellFormedLow && WellFormedHi)
12258 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12259 }
12260
12261 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12262
12263 if (SecondSrc) {
12264 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12265 assert(OtherOp.getValueSizeInBits() == 32);
12266 }
12267
12268 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12269
12270 assert(Op.getValueType().isByteSized() &&
12271 OtherOp.getValueType().isByteSized());
12272
12273 // If the ultimate src is less than 32 bits, then we will only be
12274 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12275 // CalculateByteProvider would not have returned Op as source if we
12276 // used a byte that is outside its ValueType. Thus, we are free to
12277 // ANY_EXTEND as the extended bits are dont-cares.
12278 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12279 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12280
12281 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12282 DAG.getConstant(PermMask, DL, MVT::i32));
12283 }
12284 return SDValue();
12285}
12286
12287SDValue SITargetLowering::performOrCombine(SDNode *N,
12288 DAGCombinerInfo &DCI) const {
12289 SelectionDAG &DAG = DCI.DAG;
12290 SDValue LHS = N->getOperand(0);
12291 SDValue RHS = N->getOperand(1);
12292
12293 EVT VT = N->getValueType(0);
12294 if (VT == MVT::i1) {
12295 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12296 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12297 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12298 SDValue Src = LHS.getOperand(0);
12299 if (Src != RHS.getOperand(0))
12300 return SDValue();
12301
12302 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12303 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12304 if (!CLHS || !CRHS)
12305 return SDValue();
12306
12307 // Only 10 bits are used.
12308 static const uint32_t MaxMask = 0x3ff;
12309
12310 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12311 SDLoc DL(N);
12312 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12313 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12314 }
12315
12316 return SDValue();
12317 }
12318
12319 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12320 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12321 LHS.getOpcode() == AMDGPUISD::PERM &&
12322 isa<ConstantSDNode>(LHS.getOperand(2))) {
12323 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12324 if (!Sel)
12325 return SDValue();
12326
12327 Sel |= LHS.getConstantOperandVal(2);
12328 SDLoc DL(N);
12329 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12330 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12331 }
12332
12333 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12334 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12335 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12336 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12337
12338 // If all the uses of an or need to extract the individual elements, do not
12339 // attempt to lower into v_perm
12340 auto usesCombinedOperand = [](SDNode *OrUse) {
12341 // If we have any non-vectorized use, then it is a candidate for v_perm
12342 if (OrUse->getOpcode() != ISD::BITCAST ||
12343 !OrUse->getValueType(0).isVector())
12344 return true;
12345
12346 // If we have any non-vectorized use, then it is a candidate for v_perm
12347 for (auto VUse : OrUse->uses()) {
12348 if (!VUse->getValueType(0).isVector())
12349 return true;
12350
12351 // If the use of a vector is a store, then combining via a v_perm
12352 // is beneficial.
12353 // TODO -- whitelist more uses
12354 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12355 if (VUse->getOpcode() == VectorwiseOp)
12356 return true;
12357 }
12358 return false;
12359 };
12360
12361 if (!any_of(N->uses(), usesCombinedOperand))
12362 return SDValue();
12363
12364 uint32_t LHSMask = getPermuteMask(LHS);
12365 uint32_t RHSMask = getPermuteMask(RHS);
12366
12367 if (LHSMask != ~0u && RHSMask != ~0u) {
12368 // Canonicalize the expression in an attempt to have fewer unique masks
12369 // and therefore fewer registers used to hold the masks.
12370 if (LHSMask > RHSMask) {
12371 std::swap(LHSMask, RHSMask);
12372 std::swap(LHS, RHS);
12373 }
12374
12375 // Mark with 0xc each lane actually used from a source operand. Zero bytes
12376 // have 0xc in the mask, constant 0xff bytes have 0xff, real lanes are 0-3.
12377 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12378 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12379
12380 // Check if we need to combine values from two sources within a byte.
12381 if (!(LHSUsedLanes & RHSUsedLanes) &&
12382 // If we select high and lower word keep it for SDWA.
12383 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12384 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12385 // Kill zero bytes selected by other mask. Zero value is 0xc.
12386 LHSMask &= ~RHSUsedLanes;
12387 RHSMask &= ~LHSUsedLanes;
12388 // Add 4 to each active LHS lane
12389 LHSMask |= LHSUsedLanes & 0x04040404;
12390 // Combine masks
12391 uint32_t Sel = LHSMask | RHSMask;
12392 SDLoc DL(N);
12393
12394 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12395 LHS.getOperand(0), RHS.getOperand(0),
12396 DAG.getConstant(Sel, DL, MVT::i32));
12397 }
12398 }
12399 if (LHSMask == ~0u || RHSMask == ~0u) {
12400 if (SDValue Perm = matchPERM(N, DCI))
12401 return Perm;
12402 }
12403 }
12404
12405 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12406 return SDValue();
12407
12408 // TODO: This could be a generic combine with a predicate for extracting the
12409 // high half of an integer being free.
12410
12411 // (or i64:x, (zero_extend i32:y)) ->
12412 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12413 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12414 RHS.getOpcode() != ISD::ZERO_EXTEND)
12415 std::swap(LHS, RHS);
12416
12417 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12418 SDValue ExtSrc = RHS.getOperand(0);
12419 EVT SrcVT = ExtSrc.getValueType();
12420 if (SrcVT == MVT::i32) {
12421 SDLoc SL(N);
12422 SDValue LowLHS, HiBits;
12423 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12424 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12425
12426 DCI.AddToWorklist(LowOr.getNode());
12427 DCI.AddToWorklist(HiBits.getNode());
12428
12429 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12430 LowOr, HiBits);
12431 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12432 }
12433 }
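// For example, (or i64:x, (zero_extend i32:y)) only needs a 32-bit or on the
// low half because the zero-extended high half contributes nothing, so the
// high dword of x is passed through unchanged.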
12434
12435 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12436 if (CRHS) {
12437 if (SDValue Split
12438 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12439 N->getOperand(0), CRHS))
12440 return Split;
12441 }
12442
12443 return SDValue();
12444}
12445
12446SDValue SITargetLowering::performXorCombine(SDNode *N,
12447 DAGCombinerInfo &DCI) const {
12448 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12449 return RV;
12450
12451 SDValue LHS = N->getOperand(0);
12452 SDValue RHS = N->getOperand(1);
12453
12454 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12455 SelectionDAG &DAG = DCI.DAG;
12456
12457 EVT VT = N->getValueType(0);
12458 if (CRHS && VT == MVT::i64) {
12459 if (SDValue Split
12460 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12461 return Split;
12462 }
12463
12464 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12465 // fneg-like xors into 64-bit select.
12466 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12467 // This looks like an fneg, try to fold as a source modifier.
12468 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12469 shouldFoldFNegIntoSrc(N, LHS)) {
12470 // xor (select c, a, b), 0x80000000 ->
12471 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12472 SDLoc DL(N);
12473 SDValue CastLHS =
12474 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12475 SDValue CastRHS =
12476 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12477 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12478 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12479 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12480 LHS->getOperand(0), FNegLHS, FNegRHS);
12481 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12482 }
12483 }
12484
12485 return SDValue();
12486}
12487
12488SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12489 DAGCombinerInfo &DCI) const {
12490 if (!Subtarget->has16BitInsts() ||
12491 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12492 return SDValue();
12493
12494 EVT VT = N->getValueType(0);
12495 if (VT != MVT::i32)
12496 return SDValue();
12497
12498 SDValue Src = N->getOperand(0);
12499 if (Src.getValueType() != MVT::i16)
12500 return SDValue();
12501
12502 return SDValue();
12503}
12504
12505SDValue
12506SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12507 DAGCombinerInfo &DCI) const {
12508 SDValue Src = N->getOperand(0);
12509 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12510
12511 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12512 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12513 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12514 VTSign->getVT() == MVT::i8) ||
12515 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12516 VTSign->getVT() == MVT::i16))) {
12517 assert(Subtarget->hasScalarSubwordLoads() &&
12518 "s_buffer_load_{u8, i8} are supported "
12519 "in GFX12 (or newer) architectures.");
12520 EVT VT = Src.getValueType();
12521 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12522 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12523 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12524 SDLoc DL(N);
12525 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12526 SDValue Ops[] = {
12527 Src.getOperand(0), // source register
12528 Src.getOperand(1), // offset
12529 Src.getOperand(2) // cachePolicy
12530 };
12531 auto *M = cast<MemSDNode>(Src);
12532 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12533 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12534 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12535 return LoadVal;
12536 }
12537 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12538 VTSign->getVT() == MVT::i8) ||
12539 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12540 VTSign->getVT() == MVT::i16)) &&
12541 Src.hasOneUse()) {
12542 auto *M = cast<MemSDNode>(Src);
12543 SDValue Ops[] = {
12544 Src.getOperand(0), // Chain
12545 Src.getOperand(1), // rsrc
12546 Src.getOperand(2), // vindex
12547 Src.getOperand(3), // voffset
12548 Src.getOperand(4), // soffset
12549 Src.getOperand(5), // offset
12550 Src.getOperand(6),
12551 Src.getOperand(7)
12552 };
12553 // replace with BUFFER_LOAD_BYTE/SHORT
12554 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12555 Src.getOperand(0).getValueType());
12556 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12557 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12558 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12559 ResList,
12560 Ops, M->getMemoryVT(),
12561 M->getMemOperand());
12562 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12563 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12564 }
12565 return SDValue();
12566}
12567
12568SDValue SITargetLowering::performClassCombine(SDNode *N,
12569 DAGCombinerInfo &DCI) const {
12570 SelectionDAG &DAG = DCI.DAG;
12571 SDValue Mask = N->getOperand(1);
12572
12573 // fp_class x, 0 -> false
12574 if (isNullConstant(Mask))
12575 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12576
12577 if (N->getOperand(0).isUndef())
12578 return DAG.getUNDEF(MVT::i1);
12579
12580 return SDValue();
12581}
12582
12583SDValue SITargetLowering::performRcpCombine(SDNode *N,
12584 DAGCombinerInfo &DCI) const {
12585 EVT VT = N->getValueType(0);
12586 SDValue N0 = N->getOperand(0);
12587
12588 if (N0.isUndef()) {
12589 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12590 SDLoc(N), VT);
12591 }
12592
12593 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12594 N0.getOpcode() == ISD::SINT_TO_FP)) {
12595 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12596 N->getFlags());
12597 }
12598
12599 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12600 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12601 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12602 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12603 N0.getOperand(0), N->getFlags());
12604 }
12605
12607}
12608
12609 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12610 unsigned MaxDepth) const {
12611 unsigned Opcode = Op.getOpcode();
12612 if (Opcode == ISD::FCANONICALIZE)
12613 return true;
12614
12615 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12616 const auto &F = CFP->getValueAPF();
12617 if (F.isNaN() && F.isSignaling())
12618 return false;
12619 if (!F.isDenormal())
12620 return true;
12621
12622 DenormalMode Mode =
12623 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12624 return Mode == DenormalMode::getIEEE();
12625 }
12626
12627 // If source is a result of another standard FP operation it is already in
12628 // canonical form.
12629 if (MaxDepth == 0)
12630 return false;
12631
12632 switch (Opcode) {
12633 // These will flush denorms if required.
12634 case ISD::FADD:
12635 case ISD::FSUB:
12636 case ISD::FMUL:
12637 case ISD::FCEIL:
12638 case ISD::FFLOOR:
12639 case ISD::FMA:
12640 case ISD::FMAD:
12641 case ISD::FSQRT:
12642 case ISD::FDIV:
12643 case ISD::FREM:
12644 case ISD::FP_ROUND:
12645 case ISD::FP_EXTEND:
12646 case ISD::FP16_TO_FP:
12647 case ISD::FP_TO_FP16:
12648 case ISD::BF16_TO_FP:
12649 case ISD::FP_TO_BF16:
12650 case ISD::FLDEXP:
12653 case AMDGPUISD::RCP:
12654 case AMDGPUISD::RSQ:
12658 case AMDGPUISD::LOG:
12659 case AMDGPUISD::EXP:
12663 case AMDGPUISD::FRACT:
12670 case AMDGPUISD::SIN_HW:
12671 case AMDGPUISD::COS_HW:
12672 return true;
12673
12674 // It can/will be lowered or combined as a bit operation.
12675 // Need to check their input recursively to handle.
12676 case ISD::FNEG:
12677 case ISD::FABS:
12678 case ISD::FCOPYSIGN:
12679 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12680
12681 case ISD::AND:
12682 if (Op.getValueType() == MVT::i32) {
12683 // Be careful as we only know it is a bitcast floating point type. It
12684 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12685 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12686 // is valid to optimize for all types.
12687 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12688 if (RHS->getZExtValue() == 0xffff0000) {
12689 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12690 }
12691 }
12692 }
12693 break;
12694
12695 case ISD::FSIN:
12696 case ISD::FCOS:
12697 case ISD::FSINCOS:
12698 return Op.getValueType().getScalarType() != MVT::f16;
12699
12700 case ISD::FMINNUM:
12701 case ISD::FMAXNUM:
12702 case ISD::FMINNUM_IEEE:
12703 case ISD::FMAXNUM_IEEE:
12704 case ISD::FMINIMUM:
12705 case ISD::FMAXIMUM:
12706 case AMDGPUISD::CLAMP:
12707 case AMDGPUISD::FMED3:
12708 case AMDGPUISD::FMAX3:
12709 case AMDGPUISD::FMIN3:
12710 case AMDGPUISD::FMAXIMUM3:
12711 case AMDGPUISD::FMINIMUM3: {
12712 // FIXME: Shouldn't treat the generic operations differently based on these.
12713 // However, we aren't really required to flush the result from
12714 // minnum/maxnum..
12715
12716 // snans will be quieted, so we only need to worry about denormals.
12717 if (Subtarget->supportsMinMaxDenormModes() ||
12718 // FIXME: denormalsEnabledForType is broken for dynamic
12719 denormalsEnabledForType(DAG, Op.getValueType()))
12720 return true;
12721
12722 // Flushing may be required.
12723 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12724 // targets need to check their input recursively.
12725
12726 // FIXME: Does this apply with clamp? It's implemented with max.
12727 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12728 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12729 return false;
12730 }
12731
12732 return true;
12733 }
12734 case ISD::SELECT: {
12735 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12736 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12737 }
12738 case ISD::BUILD_VECTOR: {
12739 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12740 SDValue SrcOp = Op.getOperand(i);
12741 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12742 return false;
12743 }
12744
12745 return true;
12746 }
12747 case ISD::EXTRACT_VECTOR_ELT:
12748 case ISD::EXTRACT_SUBVECTOR: {
12749 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12750 }
12751 case ISD::INSERT_VECTOR_ELT: {
12752 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12753 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12754 }
12755 case ISD::UNDEF:
12756 // Could be anything.
12757 return false;
12758
12759 case ISD::BITCAST:
12760 // TODO: This is incorrect as it loses track of the operand's type. We may
12761 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12762 // same bits that are canonicalized in one type need not be in the other.
12763 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12764 case ISD::TRUNCATE: {
12765 // Hack round the mess we make when legalizing extract_vector_elt
12766 if (Op.getValueType() == MVT::i16) {
12767 SDValue TruncSrc = Op.getOperand(0);
12768 if (TruncSrc.getValueType() == MVT::i32 &&
12769 TruncSrc.getOpcode() == ISD::BITCAST &&
12770 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12771 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12772 }
12773 }
12774 return false;
12775 }
12776 case ISD::INTRINSIC_WO_CHAIN: {
12777 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12778 // TODO: Handle more intrinsics
12779 switch (IntrinsicID) {
12780 case Intrinsic::amdgcn_cvt_pkrtz:
12781 case Intrinsic::amdgcn_cubeid:
12782 case Intrinsic::amdgcn_frexp_mant:
12783 case Intrinsic::amdgcn_fdot2:
12784 case Intrinsic::amdgcn_rcp:
12785 case Intrinsic::amdgcn_rsq:
12786 case Intrinsic::amdgcn_rsq_clamp:
12787 case Intrinsic::amdgcn_rcp_legacy:
12788 case Intrinsic::amdgcn_rsq_legacy:
12789 case Intrinsic::amdgcn_trig_preop:
12790 case Intrinsic::amdgcn_log:
12791 case Intrinsic::amdgcn_exp2:
12792 case Intrinsic::amdgcn_sqrt:
12793 return true;
12794 default:
12795 break;
12796 }
12797
12798 break;
12799 }
12800 default:
12801 break;
12802 }
12803
12804 // FIXME: denormalsEnabledForType is broken for dynamic
12805 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12806 DAG.isKnownNeverSNaN(Op);
12807}
12808
12809 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12810 unsigned MaxDepth) const {
12811 const MachineRegisterInfo &MRI = MF.getRegInfo();
12812 MachineInstr *MI = MRI.getVRegDef(Reg);
12813 unsigned Opcode = MI->getOpcode();
12814
12815 if (Opcode == AMDGPU::G_FCANONICALIZE)
12816 return true;
12817
12818 std::optional<FPValueAndVReg> FCR;
12819 // Constant splat (can be padded with undef) or scalar constant.
12820 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12821 if (FCR->Value.isSignaling())
12822 return false;
12823 if (!FCR->Value.isDenormal())
12824 return true;
12825
12826 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12827 return Mode == DenormalMode::getIEEE();
12828 }
12829
12830 if (MaxDepth == 0)
12831 return false;
12832
12833 switch (Opcode) {
12834 case AMDGPU::G_FADD:
12835 case AMDGPU::G_FSUB:
12836 case AMDGPU::G_FMUL:
12837 case AMDGPU::G_FCEIL:
12838 case AMDGPU::G_FFLOOR:
12839 case AMDGPU::G_FRINT:
12840 case AMDGPU::G_FNEARBYINT:
12841 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12842 case AMDGPU::G_INTRINSIC_TRUNC:
12843 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12844 case AMDGPU::G_FMA:
12845 case AMDGPU::G_FMAD:
12846 case AMDGPU::G_FSQRT:
12847 case AMDGPU::G_FDIV:
12848 case AMDGPU::G_FREM:
12849 case AMDGPU::G_FPOW:
12850 case AMDGPU::G_FPEXT:
12851 case AMDGPU::G_FLOG:
12852 case AMDGPU::G_FLOG2:
12853 case AMDGPU::G_FLOG10:
12854 case AMDGPU::G_FPTRUNC:
12855 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12856 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12857 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12858 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12859 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12860 return true;
12861 case AMDGPU::G_FNEG:
12862 case AMDGPU::G_FABS:
12863 case AMDGPU::G_FCOPYSIGN:
12864 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12865 case AMDGPU::G_FMINNUM:
12866 case AMDGPU::G_FMAXNUM:
12867 case AMDGPU::G_FMINNUM_IEEE:
12868 case AMDGPU::G_FMAXNUM_IEEE:
12869 case AMDGPU::G_FMINIMUM:
12870 case AMDGPU::G_FMAXIMUM: {
12871 if (Subtarget->supportsMinMaxDenormModes() ||
12872 // FIXME: denormalsEnabledForType is broken for dynamic
12873 denormalsEnabledForType(MRI.getType(Reg), MF))
12874 return true;
12875
12876 [[fallthrough]];
12877 }
12878 case AMDGPU::G_BUILD_VECTOR:
12879 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12880 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12881 return false;
12882 return true;
12883 case AMDGPU::G_INTRINSIC:
12884 case AMDGPU::G_INTRINSIC_CONVERGENT:
12885 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12886 case Intrinsic::amdgcn_fmul_legacy:
12887 case Intrinsic::amdgcn_fmad_ftz:
12888 case Intrinsic::amdgcn_sqrt:
12889 case Intrinsic::amdgcn_fmed3:
12890 case Intrinsic::amdgcn_sin:
12891 case Intrinsic::amdgcn_cos:
12892 case Intrinsic::amdgcn_log:
12893 case Intrinsic::amdgcn_exp2:
12894 case Intrinsic::amdgcn_log_clamp:
12895 case Intrinsic::amdgcn_rcp:
12896 case Intrinsic::amdgcn_rcp_legacy:
12897 case Intrinsic::amdgcn_rsq:
12898 case Intrinsic::amdgcn_rsq_clamp:
12899 case Intrinsic::amdgcn_rsq_legacy:
12900 case Intrinsic::amdgcn_div_scale:
12901 case Intrinsic::amdgcn_div_fmas:
12902 case Intrinsic::amdgcn_div_fixup:
12903 case Intrinsic::amdgcn_fract:
12904 case Intrinsic::amdgcn_cvt_pkrtz:
12905 case Intrinsic::amdgcn_cubeid:
12906 case Intrinsic::amdgcn_cubema:
12907 case Intrinsic::amdgcn_cubesc:
12908 case Intrinsic::amdgcn_cubetc:
12909 case Intrinsic::amdgcn_frexp_mant:
12910 case Intrinsic::amdgcn_fdot2:
12911 case Intrinsic::amdgcn_trig_preop:
12912 return true;
12913 default:
12914 break;
12915 }
12916
12917 [[fallthrough]];
12918 default:
12919 return false;
12920 }
12921
12922 llvm_unreachable("invalid operation");
12923}
12924
12925// Constant fold canonicalize.
12926SDValue SITargetLowering::getCanonicalConstantFP(
12927 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12928 // Flush denormals to 0 if not enabled.
12929 if (C.isDenormal()) {
12930 DenormalMode Mode =
12931 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12932 if (Mode == DenormalMode::getPreserveSign()) {
12933 return DAG.getConstantFP(
12934 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12935 }
12936
12937 if (Mode != DenormalMode::getIEEE())
12938 return SDValue();
12939 }
12940
12941 if (C.isNaN()) {
12942 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12943 if (C.isSignaling()) {
12944 // Quiet a signaling NaN.
12945 // FIXME: Is this supposed to preserve payload bits?
12946 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12947 }
12948
12949 // Make sure it is the canonical NaN bitpattern.
12950 //
12951 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12952 // immediate?
12953 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12954 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12955 }
12956
12957 // Already canonical.
12958 return DAG.getConstantFP(C, SL, VT);
12959}
12960
12962 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12963}
12964
12965SDValue SITargetLowering::performFCanonicalizeCombine(
12966 SDNode *N,
12967 DAGCombinerInfo &DCI) const {
12968 SelectionDAG &DAG = DCI.DAG;
12969 SDValue N0 = N->getOperand(0);
12970 EVT VT = N->getValueType(0);
12971
12972 // fcanonicalize undef -> qnan
12973 if (N0.isUndef()) {
12974 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
12975 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12976 }
12977
12978 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12979 EVT VT = N->getValueType(0);
12980 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12981 }
12982
12983 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12984 // (fcanonicalize k)
12985 //
12986 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12987
12988 // TODO: This could be better with wider vectors that will be split to v2f16,
12989 // and to consider uses since there aren't that many packed operations.
12990 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12991 isTypeLegal(MVT::v2f16)) {
12992 SDLoc SL(N);
12993 SDValue NewElts[2];
12994 SDValue Lo = N0.getOperand(0);
12995 SDValue Hi = N0.getOperand(1);
12996 EVT EltVT = Lo.getValueType();
12997
12999 for (unsigned I = 0; I != 2; ++I) {
13000 SDValue Op = N0.getOperand(I);
13001 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13002 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
13003 CFP->getValueAPF());
13004 } else if (Op.isUndef()) {
13005 // Handled below based on what the other operand is.
13006 NewElts[I] = Op;
13007 } else {
13008 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13009 }
13010 }
13011
13012 // If one half is undef, and one is constant, prefer a splat vector rather
13013 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13014 // cheaper to use and may be free with a packed operation.
13015 if (NewElts[0].isUndef()) {
13016 if (isa<ConstantFPSDNode>(NewElts[1]))
13017 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13018 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
13019 }
13020
13021 if (NewElts[1].isUndef()) {
13022 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13023 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
13024 }
13025
13026 return DAG.getBuildVector(VT, SL, NewElts);
13027 }
13028 }
13029
13030 return SDValue();
13031}
13032
13033static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13034 switch (Opc) {
13035 case ISD::FMAXNUM:
13036 case ISD::FMAXNUM_IEEE:
13037 return AMDGPUISD::FMAX3;
13038 case ISD::FMAXIMUM:
13039 return AMDGPUISD::FMAXIMUM3;
13040 case ISD::SMAX:
13041 return AMDGPUISD::SMAX3;
13042 case ISD::UMAX:
13043 return AMDGPUISD::UMAX3;
13044 case ISD::FMINNUM:
13045 case ISD::FMINNUM_IEEE:
13046 return AMDGPUISD::FMIN3;
13047 case ISD::FMINIMUM:
13048 return AMDGPUISD::FMINIMUM3;
13049 case ISD::SMIN:
13050 return AMDGPUISD::SMIN3;
13051 case ISD::UMIN:
13052 return AMDGPUISD::UMIN3;
13053 default:
13054 llvm_unreachable("Not a min/max opcode");
13055 }
13056}
13057
13058SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13059 const SDLoc &SL, SDValue Src,
13060 SDValue MinVal,
13061 SDValue MaxVal,
13062 bool Signed) const {
13063
13064 // med3 comes from
13065 // min(max(x, K0), K1), K0 < K1
13066 // max(min(x, K0), K1), K1 < K0
13067 //
13068 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13069 // min/max op.
13070 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13071 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13072
13073 if (!MinK || !MaxK)
13074 return SDValue();
13075
13076 if (Signed) {
13077 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13078 return SDValue();
13079 } else {
13080 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13081 return SDValue();
13082 }
13083
13084 EVT VT = MinK->getValueType(0);
13085 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13086 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13087 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13088
13089 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13090 // not available, but this is unlikely to be profitable as constants
13091 // will often need to be materialized & extended, especially on
13092 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13093 return SDValue();
13094}
13095
13096 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13097 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13098 return C;
13099
13100 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13101 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13102 return C;
13103 }
13104
13105 return nullptr;
13106}
13107
13108SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13109 const SDLoc &SL,
13110 SDValue Op0,
13111 SDValue Op1) const {
13112 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13113 if (!K1)
13114 return SDValue();
13115
13117 if (!K0)
13118 return SDValue();
13119
13120 // Ordered >= (although NaN inputs should have folded away by now).
13121 if (K0->getValueAPF() > K1->getValueAPF())
13122 return SDValue();
13123
13124 const MachineFunction &MF = DAG.getMachineFunction();
13125 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13126
13127 // TODO: Check IEEE bit enabled?
13128 EVT VT = Op0.getValueType();
13129 if (Info->getMode().DX10Clamp) {
13130 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13131 // hardware fmed3 behavior converting to a min.
13132 // FIXME: Should this be allowing -0.0?
13133 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13134 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13135 }
13136
13137 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13138 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13139 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13140 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13141 // then give the other result, which is different from med3 with a NaN
13142 // input.
13143 SDValue Var = Op0.getOperand(0);
13144 if (!DAG.isKnownNeverSNaN(Var))
13145 return SDValue();
13146
13147 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13148
13149 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13150 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13151 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13152 Var, SDValue(K0, 0), SDValue(K1, 0));
13153 }
13154 }
13155
13156 return SDValue();
13157}
13158
13159/// \return true if the subtarget supports minimum3 and maximum3 with the given
13160/// base min/max opcode \p Opc for type \p VT.
13161static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13162 EVT VT) {
13163 switch (Opc) {
13164 case ISD::FMINNUM:
13165 case ISD::FMAXNUM:
13166 case ISD::FMINNUM_IEEE:
13167 case ISD::FMAXNUM_IEEE:
13168 case AMDGPUISD::FMIN_LEGACY:
13169 case AMDGPUISD::FMAX_LEGACY:
13170 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13171 case ISD::FMINIMUM:
13172 case ISD::FMAXIMUM:
13173 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
13174 case ISD::SMAX:
13175 case ISD::SMIN:
13176 case ISD::UMAX:
13177 case ISD::UMIN:
13178 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13179 default:
13180 return false;
13181 }
13182
13183 llvm_unreachable("not a min/max opcode");
13184}
13185
13186SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13187 DAGCombinerInfo &DCI) const {
13188 SelectionDAG &DAG = DCI.DAG;
13189
13190 EVT VT = N->getValueType(0);
13191 unsigned Opc = N->getOpcode();
13192 SDValue Op0 = N->getOperand(0);
13193 SDValue Op1 = N->getOperand(1);
13194
13195 // Only do this if the inner op has one use since this will just increase
13196 // register pressure for no benefit.
13197
13198 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13199 // max(max(a, b), c) -> max3(a, b, c)
13200 // min(min(a, b), c) -> min3(a, b, c)
13201 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13202 SDLoc DL(N);
13203 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13204 DL,
13205 N->getValueType(0),
13206 Op0.getOperand(0),
13207 Op0.getOperand(1),
13208 Op1);
13209 }
13210
13211 // Try commuted.
13212 // max(a, max(b, c)) -> max3(a, b, c)
13213 // min(a, min(b, c)) -> min3(a, b, c)
13214 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13215 SDLoc DL(N);
13216 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13217 DL,
13218 N->getValueType(0),
13219 Op0,
13220 Op1.getOperand(0),
13221 Op1.getOperand(1));
13222 }
13223 }
13224
13225 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13226 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13227 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13228 if (SDValue Med3 = performIntMed3ImmCombine(
13229 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13230 return Med3;
13231 }
13232 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13233 if (SDValue Med3 = performIntMed3ImmCombine(
13234 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13235 return Med3;
13236 }
13237
13238 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13239 if (SDValue Med3 = performIntMed3ImmCombine(
13240 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13241 return Med3;
13242 }
13243 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13244 if (SDValue Med3 = performIntMed3ImmCombine(
13245 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13246 return Med3;
13247 }
13248
13249 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13250 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13251 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13252 (Opc == AMDGPUISD::FMIN_LEGACY &&
13253 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13254 (VT == MVT::f32 || VT == MVT::f64 ||
13255 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13256 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13257 Op0.hasOneUse()) {
13258 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13259 return Res;
13260 }
13261
13262 return SDValue();
13263}
13264
13265 static bool isClampZeroToOne(SDValue A, SDValue B) {
13266 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13267 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13268 // FIXME: Should this be allowing -0.0?
13269 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13270 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13271 }
13272 }
13273
13274 return false;
13275}
13276
13277// FIXME: Should only worry about snans for version with chain.
13278SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13279 DAGCombinerInfo &DCI) const {
13280 EVT VT = N->getValueType(0);
13281 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13282 // NaNs. With a NaN input, the order of the operands may change the result.
13283
13284 SelectionDAG &DAG = DCI.DAG;
13285 SDLoc SL(N);
13286
13287 SDValue Src0 = N->getOperand(0);
13288 SDValue Src1 = N->getOperand(1);
13289 SDValue Src2 = N->getOperand(2);
13290
13291 if (isClampZeroToOne(Src0, Src1)) {
13292 // const_a, const_b, x -> clamp is safe in all cases including signaling
13293 // nans.
13294 // FIXME: Should this be allowing -0.0?
13295 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13296 }
13297
13298 const MachineFunction &MF = DAG.getMachineFunction();
13299 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13300
13301 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13302 // handling no dx10-clamp?
13303 if (Info->getMode().DX10Clamp) {
13304 // If NaNs are clamped to 0, we are free to reorder the inputs.
13305
13306 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13307 std::swap(Src0, Src1);
13308
13309 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13310 std::swap(Src1, Src2);
13311
13312 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13313 std::swap(Src0, Src1);
13314
13315 if (isClampZeroToOne(Src1, Src2))
13316 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13317 }
13318
13319 return SDValue();
13320}
13321
13322SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13323 DAGCombinerInfo &DCI) const {
13324 SDValue Src0 = N->getOperand(0);
13325 SDValue Src1 = N->getOperand(1);
13326 if (Src0.isUndef() && Src1.isUndef())
13327 return DCI.DAG.getUNDEF(N->getValueType(0));
13328 return SDValue();
13329}
13330
13331// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13332// expanded into a set of cmp/select instructions.
13333 static bool shouldExpandVectorDynExt(unsigned EltSize,
13334 unsigned NumElem,
13335 bool IsDivergentIdx,
13336 const GCNSubtarget *Subtarget) {
13337 if (UseDivergentRegisterIndexing)
13338 return false;
13339
13340 unsigned VecSize = EltSize * NumElem;
13341
13342 // Sub-dword vectors of size 2 dword or less have a better implementation.
13343 if (VecSize <= 64 && EltSize < 32)
13344 return false;
13345
13346 // Always expand the rest of sub-dword instructions, otherwise it will be
13347 // lowered via memory.
13348 if (EltSize < 32)
13349 return true;
13350
13351 // Always do this if var-idx is divergent, otherwise it will become a loop.
13352 if (IsDivergentIdx)
13353 return true;
13354
13355 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13356 unsigned NumInsts = NumElem /* Number of compares */ +
13357 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13358
13359 // On some architectures (GFX9) movrel is not available and it's better
13360 // to expand.
13361 if (!Subtarget->hasMovrel())
13362 return NumInsts <= 16;
13363
13364 // If movrel is available, use it instead of expanding for vector of 8
13365 // elements.
13366 return NumInsts <= 15;
13367}
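// Worked example for the heuristic above: a uniform index into <8 x i32>
// gives NumInsts = 8 compares + 8 cndmasks = 16, so cmp/select expansion is
// chosen only when movrel is unavailable (16 <= 16) and movrel is preferred
// otherwise (16 > 15). A uniform index into <4 x i32> gives NumInsts = 8,
// below both limits, so it is always expanded.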
13368
13369 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13370 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13371 if (isa<ConstantSDNode>(Idx))
13372 return false;
13373
13374 SDValue Vec = N->getOperand(0);
13375 EVT VecVT = Vec.getValueType();
13376 EVT EltVT = VecVT.getVectorElementType();
13377 unsigned EltSize = EltVT.getSizeInBits();
13378 unsigned NumElem = VecVT.getVectorNumElements();
13379
13380 return SITargetLowering::shouldExpandVectorDynExt(
13381 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13382}
13383
13384SDValue SITargetLowering::performExtractVectorEltCombine(
13385 SDNode *N, DAGCombinerInfo &DCI) const {
13386 SDValue Vec = N->getOperand(0);
13387 SelectionDAG &DAG = DCI.DAG;
13388
13389 EVT VecVT = Vec.getValueType();
13390 EVT VecEltVT = VecVT.getVectorElementType();
13391 EVT ResVT = N->getValueType(0);
13392
13393 unsigned VecSize = VecVT.getSizeInBits();
13394 unsigned VecEltSize = VecEltVT.getSizeInBits();
13395
13396 if ((Vec.getOpcode() == ISD::FNEG ||
13397 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13398 SDLoc SL(N);
13399 SDValue Idx = N->getOperand(1);
13400 SDValue Elt =
13401 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13402 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13403 }
13404
13405 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13406 // =>
13407 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13408 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13409 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13410 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13411 SDLoc SL(N);
13412 SDValue Idx = N->getOperand(1);
13413 unsigned Opc = Vec.getOpcode();
13414
13415 switch(Opc) {
13416 default:
13417 break;
13418 // TODO: Support other binary operations.
13419 case ISD::FADD:
13420 case ISD::FSUB:
13421 case ISD::FMUL:
13422 case ISD::ADD:
13423 case ISD::UMIN:
13424 case ISD::UMAX:
13425 case ISD::SMIN:
13426 case ISD::SMAX:
13427 case ISD::FMAXNUM:
13428 case ISD::FMINNUM:
13429 case ISD::FMAXNUM_IEEE:
13430 case ISD::FMINNUM_IEEE:
13431 case ISD::FMAXIMUM:
13432 case ISD::FMINIMUM: {
13433 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13434 Vec.getOperand(0), Idx);
13435 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13436 Vec.getOperand(1), Idx);
13437
13438 DCI.AddToWorklist(Elt0.getNode());
13439 DCI.AddToWorklist(Elt1.getNode());
13440 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13441 }
13442 }
13443 }
13444
13445 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13446 if (shouldExpandVectorDynExt(N)) {
13447 SDLoc SL(N);
13448 SDValue Idx = N->getOperand(1);
13449 SDValue V;
13450 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13451 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13452 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13453 if (I == 0)
13454 V = Elt;
13455 else
13456 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13457 }
13458 return V;
13459 }
13460
13461 if (!DCI.isBeforeLegalize())
13462 return SDValue();
13463
13464 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13465 // elements. This exposes more load reduction opportunities by replacing
13466 // multiple small extract_vector_elements with a single 32-bit extract.
13467 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13468 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13469 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13470 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13471
13472 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13473 unsigned EltIdx = BitIndex / 32;
13474 unsigned LeftoverBitIdx = BitIndex % 32;
13475 SDLoc SL(N);
13476
13477 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13478 DCI.AddToWorklist(Cast.getNode());
13479
13480 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13481 DAG.getConstant(EltIdx, SL, MVT::i32));
13482 DCI.AddToWorklist(Elt.getNode());
13483 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13484 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13485 DCI.AddToWorklist(Srl.getNode());
13486
13487 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13488 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13489 DCI.AddToWorklist(Trunc.getNode());
13490
13491 if (VecEltVT == ResVT) {
13492 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13493 }
13494
13495 assert(ResVT.isScalarInteger());
13496 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13497 }
13498
13499 return SDValue();
13500}
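// Illustration of the sub-dword rewrite above: extracting element 5 of a
// loaded v8i8 becomes a bitcast to the equivalent 32-bit vector type, an
// extract of i32 element 1 (BitIndex = 5 * 8 = 40, EltIdx = 1), an SRL by
// the leftover 8 bits, and a truncate back to the 8-bit element type.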
13501
13502SDValue
13503SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13504 DAGCombinerInfo &DCI) const {
13505 SDValue Vec = N->getOperand(0);
13506 SDValue Idx = N->getOperand(2);
13507 EVT VecVT = Vec.getValueType();
13508 EVT EltVT = VecVT.getVectorElementType();
13509
13510 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13511 // => BUILD_VECTOR n x select (e, const-idx)
13512 if (!shouldExpandVectorDynExt(N))
13513 return SDValue();
13514
13515 SelectionDAG &DAG = DCI.DAG;
13516 SDLoc SL(N);
13517 SDValue Ins = N->getOperand(1);
13518 EVT IdxVT = Idx.getValueType();
13519
13520 SmallVector<SDValue, 16> Ops;
13521 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13522 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13523 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13524 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13525 Ops.push_back(V);
13526 }
13527
13528 return DAG.getBuildVector(VecVT, SL, Ops);
13529}
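// For example, inserting a value at a variable index into a <4 x i32> becomes
// a BUILD_VECTOR of four selects, where lane I is (Idx == I) ? Ins : Vec[I].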
13530
13531/// Return the source of an fp_extend from f16 to f32, or a converted FP
13532/// constant.
13533 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13534 if (Src.getOpcode() == ISD::FP_EXTEND &&
13535 Src.getOperand(0).getValueType() == MVT::f16) {
13536 return Src.getOperand(0);
13537 }
13538
13539 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13540 APFloat Val = CFP->getValueAPF();
13541 bool LosesInfo = true;
13542 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13543 if (!LosesInfo)
13544 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13545 }
13546
13547 return SDValue();
13548}
13549
13550SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13551 DAGCombinerInfo &DCI) const {
13552 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13553 "combine only useful on gfx8");
13554
13555 SDValue TruncSrc = N->getOperand(0);
13556 EVT VT = N->getValueType(0);
13557 if (VT != MVT::f16)
13558 return SDValue();
13559
13560 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13561 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13562 return SDValue();
13563
13564 SelectionDAG &DAG = DCI.DAG;
13565 SDLoc SL(N);
13566
13567 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13568 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13569 // casting back.
13570
13571 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13572 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13573 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13574 if (!A)
13575 return SDValue();
13576
13577 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13578 if (!B)
13579 return SDValue();
13580
13581 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13582 if (!C)
13583 return SDValue();
13584
13585 // This changes signaling nan behavior. If an input is a signaling nan, it
13586 // would have been quieted by the fpext originally. We don't care because
13587 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13588 // we would be worse off than just doing the promotion.
13589 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13590 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13591 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13592 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13593}
13594
13595unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13596 const SDNode *N0,
13597 const SDNode *N1) const {
13598 EVT VT = N0->getValueType(0);
13599
13600 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13601 // support denormals ever.
13602 if (((VT == MVT::f32 &&
13603 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13604 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13605 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13606 isOperationLegal(ISD::FMAD, VT))
13607 return ISD::FMAD;
13608
13609 const TargetOptions &Options = DAG.getTarget().Options;
13610 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13611 (N0->getFlags().hasAllowContract() &&
13612 N1->getFlags().hasAllowContract())) &&
13613 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13614 return ISD::FMA;
13615 }
13616
13617 return 0;
13618}
13619
13620// For a reassociatable opcode perform:
13621// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13622SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13623 SelectionDAG &DAG) const {
13624 EVT VT = N->getValueType(0);
13625 if (VT != MVT::i32 && VT != MVT::i64)
13626 return SDValue();
13627
13628 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13629 return SDValue();
13630
13631 unsigned Opc = N->getOpcode();
13632 SDValue Op0 = N->getOperand(0);
13633 SDValue Op1 = N->getOperand(1);
13634
13635 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13636 return SDValue();
13637
13638 if (Op0->isDivergent())
13639 std::swap(Op0, Op1);
13640
13641 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13642 return SDValue();
13643
13644 SDValue Op2 = Op1.getOperand(1);
13645 Op1 = Op1.getOperand(0);
13646 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13647 return SDValue();
13648
13649 if (Op1->isDivergent())
13650 std::swap(Op1, Op2);
13651
13652 SDLoc SL(N);
13653 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13654 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13655}
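// Example of the reassociation above: "add s0, (add v0, s1)" with s0 and s1
// uniform and v0 divergent becomes "add (add s0, s1), v0", so the uniform
// part can typically be selected as a scalar add and only the final add
// stays on the VALU.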
13656
13657static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13658 EVT VT,
13659 SDValue N0, SDValue N1, SDValue N2,
13660 bool Signed) {
13661 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13662 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13663 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13664 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13665}
13666
13667// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13668// multiplies, if any.
13669//
13670// Full 64-bit multiplies that feed into an addition are lowered here instead
13671// of using the generic expansion. The generic expansion ends up with
13672// a tree of ADD nodes that prevents us from using the "add" part of the
13673// MAD instruction. The expansion produced here results in a chain of ADDs
13674// instead of a tree.
13675SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13676 DAGCombinerInfo &DCI) const {
13677 assert(N->getOpcode() == ISD::ADD);
13678
13679 SelectionDAG &DAG = DCI.DAG;
13680 EVT VT = N->getValueType(0);
13681 SDLoc SL(N);
13682 SDValue LHS = N->getOperand(0);
13683 SDValue RHS = N->getOperand(1);
13684
13685 if (VT.isVector())
13686 return SDValue();
13687
13688 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13689 // result in scalar registers for uniform values.
13690 if (!N->isDivergent() && Subtarget->hasSMulHi())
13691 return SDValue();
13692
13693 unsigned NumBits = VT.getScalarSizeInBits();
13694 if (NumBits <= 32 || NumBits > 64)
13695 return SDValue();
13696
13697 if (LHS.getOpcode() != ISD::MUL) {
13698 assert(RHS.getOpcode() == ISD::MUL);
13699 std::swap(LHS, RHS);
13700 }
13701
13702 // Avoid the fold if it would unduly increase the number of multiplies due to
13703 // multiple uses, except on hardware with full-rate multiply-add (which is
13704 // part of full-rate 64-bit ops).
13705 if (!Subtarget->hasFullRate64Ops()) {
13706 unsigned NumUsers = 0;
13707 for (SDNode *Use : LHS->uses()) {
13708 // There is a use that does not feed into addition, so the multiply can't
13709 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13710 if (Use->getOpcode() != ISD::ADD)
13711 return SDValue();
13712
13713 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13714 // MUL + 3xADD + 3xADDC over 3xMAD.
13715 ++NumUsers;
13716 if (NumUsers >= 3)
13717 return SDValue();
13718 }
13719 }
13720
13721 SDValue MulLHS = LHS.getOperand(0);
13722 SDValue MulRHS = LHS.getOperand(1);
13723 SDValue AddRHS = RHS;
13724
13725 // Always check whether operands are small unsigned values, since that
13726 // knowledge is useful in more cases. Check for small signed values only if
13727 // doing so can unlock a shorter code sequence.
13728 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13729 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13730
13731 bool MulSignedLo = false;
13732 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13733 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13734 numBitsSigned(MulRHS, DAG) <= 32;
13735 }
13736
13737 // The operands and final result all have the same number of bits. If
13738 // operands need to be extended, they can be extended with garbage. The
13739 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13740 // truncated away in the end.
13741 if (VT != MVT::i64) {
13742 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13743 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13744 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13745 }
13746
13747 // The basic code generated is conceptually straightforward. Pseudo code:
13748 //
13749 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13750 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13751 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13752 //
13753 // The second and third lines are optional, depending on whether the factors
13754 // are {sign,zero}-extended or not.
13755 //
13756 // The actual DAG is noisier than the pseudo code, but only due to
13757 // instructions that disassemble values into low and high parts, and
13758 // assemble the final result.
13759 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13760
13761 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13762 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13763 SDValue Accum =
13764 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13765
13766 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13767 SDValue AccumLo, AccumHi;
13768 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13769
13770 if (!MulLHSUnsigned32) {
13771 auto MulLHSHi =
13772 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13773 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13774 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13775 }
13776
13777 if (!MulRHSUnsigned32) {
13778 auto MulRHSHi =
13779 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13780 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13781 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13782 }
13783
13784 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13785 Accum = DAG.getBitcast(MVT::i64, Accum);
13786 }
13787
13788 if (VT != MVT::i64)
13789 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13790 return Accum;
13791}
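// Example: for a divergent i64 "add (mul a, b), c" where both a and b are
// known to fit in 32 unsigned bits, MulSignedLo stays false and both
// *Unsigned32 flags are set, so the whole expression collapses into a single
// mad_u64_u32 with no additional high-part multiplies.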
13792
13793// Collect the ultimate src of each of the mul node's operands, and confirm
13794 // each operand is only 8 bits wide.
13795static std::optional<ByteProvider<SDValue>>
13796handleMulOperand(const SDValue &MulOperand) {
13797 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13798 if (!Byte0 || Byte0->isConstantZero()) {
13799 return std::nullopt;
13800 }
13801 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13802 if (Byte1 && !Byte1->isConstantZero()) {
13803 return std::nullopt;
13804 }
13805 return Byte0;
13806}
13807
13808static unsigned addPermMasks(unsigned First, unsigned Second) {
13809 unsigned FirstCs = First & 0x0c0c0c0c;
13810 unsigned SecondCs = Second & 0x0c0c0c0c;
13811 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13812 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13813
13814 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13815 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13816 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13817 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13818
13819 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13820}
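// Worked example: addPermMasks(0x030c0c0c, 0x0c020c0c) keeps the non-0x0c
// byte selectors from each side (0x03 from the first mask, 0x02 from the
// second) and keeps 0x0c only where both masks select the zero byte, giving
// 0x03020c0c.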
13821
13822 struct DotSrc {
13823 SDValue SrcOp;
13824 int64_t PermMask;
13825 int64_t DWordOffset;
13826};
13827
13828 static void placeSources(ByteProvider<SDValue> &Src0,
13829 ByteProvider<SDValue> &Src1,
13830 SmallVectorImpl<DotSrc> &Src0s,
13831 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13832
13833 assert(Src0.Src.has_value() && Src1.Src.has_value());
13834 // Src0s and Src1s are empty, just place arbitrarily.
13835 if (Step == 0) {
13836 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13837 Src0.SrcOffset / 4});
13838 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13839 Src1.SrcOffset / 4});
13840 return;
13841 }
13842
13843 for (int BPI = 0; BPI < 2; BPI++) {
13844 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13845 if (BPI == 1) {
13846 BPP = {Src1, Src0};
13847 }
13848 unsigned ZeroMask = 0x0c0c0c0c;
13849 unsigned FMask = 0xFF << (8 * (3 - Step));
13850
13851 unsigned FirstMask =
13852 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13853 unsigned SecondMask =
13854 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13855 // Attempt to find the Src vector which contains our SDValue; if found, add
13856 // our perm mask to the existing one. If we are unable to find a match for
13857 // the first SDValue, attempt to find a match for the second.
13858 int FirstGroup = -1;
13859 for (int I = 0; I < 2; I++) {
13860 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13861 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13862 return IterElt.SrcOp == *BPP.first.Src &&
13863 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13864 };
13865
13866 auto Match = llvm::find_if(Srcs, MatchesFirst);
13867 if (Match != Srcs.end()) {
13868 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13869 FirstGroup = I;
13870 break;
13871 }
13872 }
13873 if (FirstGroup != -1) {
13874 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13875 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13876 return IterElt.SrcOp == *BPP.second.Src &&
13877 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13878 };
13879 auto Match = llvm::find_if(Srcs, MatchesSecond);
13880 if (Match != Srcs.end()) {
13881 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13882 } else
13883 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13884 return;
13885 }
13886 }
13887
13888 // If we have made it here, then we could not find a match in Src0s or Src1s
13889 // for either Src0 or Src1, so just place them arbitrarily.
13890
13891 unsigned ZeroMask = 0x0c0c0c0c;
13892 unsigned FMask = 0xFF << (8 * (3 - Step));
13893
13894 Src0s.push_back(
13895 {*Src0.Src,
13896 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13897 Src0.SrcOffset / 4});
13898 Src1s.push_back(
13899 {*Src1.Src,
13900 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13901 Src1.SrcOffset / 4});
13902
13903 return;
13904}
13905
13906 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13907 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13908 bool IsAny) {
13909
13910 // If we have just one source, permute it accordingly.
13911 if (Srcs.size() == 1) {
13912 auto Elt = Srcs.begin();
13913 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13914
13915 // v_perm will produce the original value
13916 if (Elt->PermMask == 0x3020100)
13917 return EltOp;
13918
13919 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13920 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13921 }
13922
13923 auto FirstElt = Srcs.begin();
13924 auto SecondElt = std::next(FirstElt);
13925
13926 SmallVector<SDValue, 2> Perms;
13927
13928 // If we have multiple sources in the chain, combine them via perms (using
13929 // calculated perm mask) and Ors.
13930 while (true) {
13931 auto FirstMask = FirstElt->PermMask;
13932 auto SecondMask = SecondElt->PermMask;
13933
13934 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13935 unsigned FirstPlusFour = FirstMask | 0x04040404;
13936 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
13937 // original 0x0C.
13938 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13939
13940 auto PermMask = addPermMasks(FirstMask, SecondMask);
13941 auto FirstVal =
13942 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13943 auto SecondVal =
13944 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13945
13946 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13947 SecondVal,
13948 DAG.getConstant(PermMask, SL, MVT::i32)));
13949
13950 FirstElt = std::next(SecondElt);
13951 if (FirstElt == Srcs.end())
13952 break;
13953
13954 SecondElt = std::next(FirstElt);
13955 // If we only have a FirstElt, then just combine that into the cumulative
13956 // source node.
13957 if (SecondElt == Srcs.end()) {
13958 auto EltOp =
13959 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13960
13961 Perms.push_back(
13962 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13963 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13964 break;
13965 }
13966 }
13967
13968 assert(Perms.size() == 1 || Perms.size() == 2);
13969 return Perms.size() == 2
13970 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13971 : Perms[0];
13972}
13973
13974static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13975 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13976 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13977 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13978 EntryMask += ZeroMask;
13979 }
13980}
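// Example: with ChainLength == 2, an accumulated mask of 0x01000c0c is
// shifted right by 16 to 0x00000100 and then 0x0c0c0000 is added, giving
// 0x0c0c0100; the two live byte selectors move to the low positions and the
// unused high bytes become the constant-zero selector 0x0c.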
13981
13982static bool isMul(const SDValue Op) {
13983 auto Opcode = Op.getOpcode();
13984
13985 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13986 Opcode == AMDGPUISD::MUL_I24);
13987}
13988
13989static std::optional<bool>
13990 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13991 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13992 const SDValue &S1Op, const SelectionDAG &DAG) {
13993 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13994 // of the dot4 are irrelevant.
13995 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13996 return false;
13997
13998 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13999 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14000 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14001 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14002 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14003 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14004
14005 assert(!(S0IsUnsigned && S0IsSigned));
14006 assert(!(S1IsUnsigned && S1IsSigned));
14007
14008 // There are 9 possible permutations of
14009 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14010
14011 // In two permutations, the sign bits are known to be the same for both Ops,
14012 // so simply return Signed / Unsigned corresponding to the MSB
14013
14014 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14015 return S0IsSigned;
14016
14017 // In another two permutations, the sign bits are known to be opposite. In
14018 // this case return std::nullopt to indicate a bad match.
14019
14020 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14021 return std::nullopt;
14022
14023 // In the remaining five permutations, we don't know the value of the sign
14024 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14025 // the upper bits must be extension bits. Thus, the only ways for the sign
14026 // bit to be unknown are if it was sign extended from an unknown value, or
14027 // if it was any extended. In either case, it is correct to use the signed
14028 // version of the dot4 signedness semantics.
14029
14030 // In two such permutations, we know the sign bit is set for
14031 // one op, and the other is unknown. It is okay to use the signed version of
14032 // dot4.
14033 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14034 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14035 return true;
14036
14037 // In one such permutation, we don't know either of the sign bits. It is okay
14038 // to use the signed version of dot4.
14039 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14040 return true;
14041
14042 // In two such permutations, we know the sign bit is unset for
14043 // one op, and the other is unknown. Return std::nullopt to indicate a
14044 // bad match.
14045 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14046 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14047 return std::nullopt;
14048
14049 llvm_unreachable("Fully covered condition");
14050}
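// For example, if computeKnownBits proves both multiplicands have a known
// zero sign bit, the function returns false and the caller selects the
// unsigned udot4 form; one known-set and one known-clear sign bit returns
// std::nullopt, which blocks the dot4 match.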
14051
14052SDValue SITargetLowering::performAddCombine(SDNode *N,
14053 DAGCombinerInfo &DCI) const {
14054 SelectionDAG &DAG = DCI.DAG;
14055 EVT VT = N->getValueType(0);
14056 SDLoc SL(N);
14057 SDValue LHS = N->getOperand(0);
14058 SDValue RHS = N->getOperand(1);
14059
14060 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14061 if (Subtarget->hasMad64_32()) {
14062 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14063 return Folded;
14064 }
14065 }
14066
14067 if (SDValue V = reassociateScalarOps(N, DAG)) {
14068 return V;
14069 }
14070
14071 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14072 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14073 SDValue TempNode(N, 0);
14074 std::optional<bool> IsSigned;
14075 SmallVector<DotSrc, 4> Src0s;
14076 SmallVector<DotSrc, 4> Src1s;
14077 SmallVector<SDValue, 4> Src2s;
14078
14079 // Match the v_dot4 tree, while collecting src nodes.
14080 int ChainLength = 0;
14081 for (int I = 0; I < 4; I++) {
14082 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14083 if (MulIdx == -1)
14084 break;
14085 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14086 if (!Src0)
14087 break;
14088 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14089 if (!Src1)
14090 break;
14091
14092 auto IterIsSigned = checkDot4MulSignedness(
14093 TempNode->getOperand(MulIdx), *Src0, *Src1,
14094 TempNode->getOperand(MulIdx)->getOperand(0),
14095 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14096 if (!IterIsSigned)
14097 break;
14098 if (!IsSigned)
14099 IsSigned = *IterIsSigned;
14100 if (*IterIsSigned != *IsSigned)
14101 break;
14102 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14103 auto AddIdx = 1 - MulIdx;
14104 // Allow the special case where add (add (mul24, 0), mul24) has become
14105 // add (mul24, mul24).
14106 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14107 Src2s.push_back(TempNode->getOperand(AddIdx));
14108 auto Src0 =
14109 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14110 if (!Src0)
14111 break;
14112 auto Src1 =
14113 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14114 if (!Src1)
14115 break;
14116 auto IterIsSigned = checkDot4MulSignedness(
14117 TempNode->getOperand(AddIdx), *Src0, *Src1,
14118 TempNode->getOperand(AddIdx)->getOperand(0),
14119 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14120 if (!IterIsSigned)
14121 break;
14122 assert(IsSigned);
14123 if (*IterIsSigned != *IsSigned)
14124 break;
14125 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14126 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14127 ChainLength = I + 2;
14128 break;
14129 }
14130
14131 TempNode = TempNode->getOperand(AddIdx);
14132 Src2s.push_back(TempNode);
14133 ChainLength = I + 1;
14134 if (TempNode->getNumOperands() < 2)
14135 break;
14136 LHS = TempNode->getOperand(0);
14137 RHS = TempNode->getOperand(1);
14138 }
14139
14140 if (ChainLength < 2)
14141 return SDValue();
14142
14143 // Masks were constructed with the assumption that we would find a chain of
14144 // length 4. If not, then we need to zero out the unused high bytes (via the
14145 // 0x0c perm selector) so they do not affect the dot calculation.
14146 if (ChainLength < 4) {
14147 fixMasks(Src0s, ChainLength);
14148 fixMasks(Src1s, ChainLength);
14149 }
14150
14151 SDValue Src0, Src1;
14152
14153 // If we are just using a single source for both, and have permuted the
14154 // bytes consistently, we can just use the sources without permuting
14155 // (commutation).
14156 bool UseOriginalSrc = false;
14157 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14158 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14159 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14160 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14161 SmallVector<unsigned, 4> SrcBytes;
14162 auto Src0Mask = Src0s.begin()->PermMask;
14163 SrcBytes.push_back(Src0Mask & 0xFF000000);
14164 bool UniqueEntries = true;
14165 for (auto I = 1; I < 4; I++) {
14166 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14167
14168 if (is_contained(SrcBytes, NextByte)) {
14169 UniqueEntries = false;
14170 break;
14171 }
14172 SrcBytes.push_back(NextByte);
14173 }
14174
14175 if (UniqueEntries) {
14176 UseOriginalSrc = true;
14177
14178 auto FirstElt = Src0s.begin();
14179 auto FirstEltOp =
14180 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14181
14182 auto SecondElt = Src1s.begin();
14183 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14184 SecondElt->DWordOffset);
14185
14186 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14187 MVT::getIntegerVT(32));
14188 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14189 MVT::getIntegerVT(32));
14190 }
14191 }
14192
14193 if (!UseOriginalSrc) {
14194 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14195 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14196 }
14197
14198 assert(IsSigned);
14199 SDValue Src2 =
14200 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14201
14202 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14203 : Intrinsic::amdgcn_udot4,
14204 SL, MVT::i64);
14205
14206 assert(!VT.isVector());
14207 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14208 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14209
14210 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14211 }
14212
14213 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14214 return SDValue();
14215
14216 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14217 // add x, sext (setcc) => usubo_carry x, 0, setcc
14218 unsigned Opc = LHS.getOpcode();
14219 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14220 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14221 std::swap(RHS, LHS);
14222
14223 Opc = RHS.getOpcode();
14224 switch (Opc) {
14225 default: break;
14226 case ISD::ZERO_EXTEND:
14227 case ISD::SIGN_EXTEND:
14228 case ISD::ANY_EXTEND: {
14229 auto Cond = RHS.getOperand(0);
14230 // If this won't be a real VOPC output, we would still need to insert an
14231 // extra instruction anyway.
14232 if (!isBoolSGPR(Cond))
14233 break;
14234 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14235 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14236 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14237 return DAG.getNode(Opc, SL, VTList, Args);
14238 }
14239 case ISD::UADDO_CARRY: {
14240 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14241 if (!isNullConstant(RHS.getOperand(1)))
14242 break;
14243 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14244 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14245 }
14246 }
14247 return SDValue();
14248}
14249
14250SDValue SITargetLowering::performSubCombine(SDNode *N,
14251 DAGCombinerInfo &DCI) const {
14252 SelectionDAG &DAG = DCI.DAG;
14253 EVT VT = N->getValueType(0);
14254
14255 if (VT != MVT::i32)
14256 return SDValue();
14257
14258 SDLoc SL(N);
14259 SDValue LHS = N->getOperand(0);
14260 SDValue RHS = N->getOperand(1);
14261
14262 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14263 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14264 unsigned Opc = RHS.getOpcode();
14265 switch (Opc) {
14266 default: break;
14267 case ISD::ZERO_EXTEND:
14268 case ISD::SIGN_EXTEND:
14269 case ISD::ANY_EXTEND: {
14270 auto Cond = RHS.getOperand(0);
14271 // If this won't be a real VOPC output, we would still need to insert an
14272 // extra instruction anyway.
14273 if (!isBoolSGPR(Cond))
14274 break;
14275 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14276 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14277 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14278 return DAG.getNode(Opc, SL, VTList, Args);
14279 }
14280 }
14281
14282 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14283 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14284 if (!isNullConstant(LHS.getOperand(1)))
14285 return SDValue();
14286 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14287 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14288 }
14289 return SDValue();
14290}
14291
14292SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14293 DAGCombinerInfo &DCI) const {
14294
14295 if (N->getValueType(0) != MVT::i32)
14296 return SDValue();
14297
14298 if (!isNullConstant(N->getOperand(1)))
14299 return SDValue();
14300
14301 SelectionDAG &DAG = DCI.DAG;
14302 SDValue LHS = N->getOperand(0);
14303
14304 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14305 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14306 unsigned LHSOpc = LHS.getOpcode();
14307 unsigned Opc = N->getOpcode();
14308 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14309 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14310 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14311 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14312 }
14313 return SDValue();
14314}
14315
14316SDValue SITargetLowering::performFAddCombine(SDNode *N,
14317 DAGCombinerInfo &DCI) const {
14318 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14319 return SDValue();
14320
14321 SelectionDAG &DAG = DCI.DAG;
14322 EVT VT = N->getValueType(0);
14323
14324 SDLoc SL(N);
14325 SDValue LHS = N->getOperand(0);
14326 SDValue RHS = N->getOperand(1);
14327
14328 // These should really be instruction patterns, but writing patterns with
14329 // source modifiers is a pain.
14330
14331 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14332 if (LHS.getOpcode() == ISD::FADD) {
14333 SDValue A = LHS.getOperand(0);
14334 if (A == LHS.getOperand(1)) {
14335 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14336 if (FusedOp != 0) {
14337 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14338 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14339 }
14340 }
14341 }
14342
14343 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14344 if (RHS.getOpcode() == ISD::FADD) {
14345 SDValue A = RHS.getOperand(0);
14346 if (A == RHS.getOperand(1)) {
14347 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14348 if (FusedOp != 0) {
14349 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14350 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14351 }
14352 }
14353 }
14354
14355 return SDValue();
14356}
14357
14358SDValue SITargetLowering::performFSubCombine(SDNode *N,
14359 DAGCombinerInfo &DCI) const {
14360 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14361 return SDValue();
14362
14363 SelectionDAG &DAG = DCI.DAG;
14364 SDLoc SL(N);
14365 EVT VT = N->getValueType(0);
14366 assert(!VT.isVector());
14367
14368 // Try to get the fneg to fold into the source modifier. This undoes generic
14369 // DAG combines and folds them into the mad.
14370 //
14371 // Only do this if we are not trying to support denormals. v_mad_f32 does
14372 // not support denormals ever.
14373 SDValue LHS = N->getOperand(0);
14374 SDValue RHS = N->getOperand(1);
14375 if (LHS.getOpcode() == ISD::FADD) {
14376 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14377 SDValue A = LHS.getOperand(0);
14378 if (A == LHS.getOperand(1)) {
14379 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14380 if (FusedOp != 0){
14381 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14382 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14383
14384 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14385 }
14386 }
14387 }
14388
14389 if (RHS.getOpcode() == ISD::FADD) {
14390 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14391
14392 SDValue A = RHS.getOperand(0);
14393 if (A == RHS.getOperand(1)) {
14394 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14395 if (FusedOp != 0){
14396 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14397 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14398 }
14399 }
14400 }
14401
14402 return SDValue();
14403}
14404
14405SDValue SITargetLowering::performFDivCombine(SDNode *N,
14406 DAGCombinerInfo &DCI) const {
14407 SelectionDAG &DAG = DCI.DAG;
14408 SDLoc SL(N);
14409 EVT VT = N->getValueType(0);
14410 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14411 return SDValue();
14412
14413 SDValue LHS = N->getOperand(0);
14414 SDValue RHS = N->getOperand(1);
14415
14416 SDNodeFlags Flags = N->getFlags();
14417 SDNodeFlags RHSFlags = RHS->getFlags();
14418 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14419 !RHS->hasOneUse())
14420 return SDValue();
14421
14422 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14423 bool IsNegative = false;
14424 if (CLHS->isExactlyValue(1.0) ||
14425 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14426 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14427 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14428 if (RHS.getOpcode() == ISD::FSQRT) {
14429 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14430 SDValue Rsq =
14431 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14432 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14433 }
14434 }
14435 }
14436
14437 return SDValue();
14438}
14439
14440SDValue SITargetLowering::performFMACombine(SDNode *N,
14441 DAGCombinerInfo &DCI) const {
14442 SelectionDAG &DAG = DCI.DAG;
14443 EVT VT = N->getValueType(0);
14444 SDLoc SL(N);
14445
14446 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14447 return SDValue();
14448
14449 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14450 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14451 SDValue Op1 = N->getOperand(0);
14452 SDValue Op2 = N->getOperand(1);
14453 SDValue FMA = N->getOperand(2);
14454
14455 if (FMA.getOpcode() != ISD::FMA ||
14456 Op1.getOpcode() != ISD::FP_EXTEND ||
14457 Op2.getOpcode() != ISD::FP_EXTEND)
14458 return SDValue();
14459
14460 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14461 // regardless of the denorm mode setting. Therefore,
14462 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14463 const TargetOptions &Options = DAG.getTarget().Options;
14464 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14465 (N->getFlags().hasAllowContract() &&
14466 FMA->getFlags().hasAllowContract())) {
14467 Op1 = Op1.getOperand(0);
14468 Op2 = Op2.getOperand(0);
14469 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14470 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14471 return SDValue();
14472
14473 SDValue Vec1 = Op1.getOperand(0);
14474 SDValue Idx1 = Op1.getOperand(1);
14475 SDValue Vec2 = Op2.getOperand(0);
14476
14477 SDValue FMAOp1 = FMA.getOperand(0);
14478 SDValue FMAOp2 = FMA.getOperand(1);
14479 SDValue FMAAcc = FMA.getOperand(2);
14480
14481 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14482 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14483 return SDValue();
14484
14485 FMAOp1 = FMAOp1.getOperand(0);
14486 FMAOp2 = FMAOp2.getOperand(0);
14487 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14488 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14489 return SDValue();
14490
14491 SDValue Vec3 = FMAOp1.getOperand(0);
14492 SDValue Vec4 = FMAOp2.getOperand(0);
14493 SDValue Idx2 = FMAOp1.getOperand(1);
14494
14495 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14496 // Idx1 and Idx2 cannot be the same.
14497 Idx1 == Idx2)
14498 return SDValue();
14499
14500 if (Vec1 == Vec2 || Vec3 == Vec4)
14501 return SDValue();
14502
14503 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14504 return SDValue();
14505
14506 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14507 (Vec1 == Vec4 && Vec2 == Vec3)) {
14508 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14509 DAG.getTargetConstant(0, SL, MVT::i1));
14510 }
14511 }
14512 return SDValue();
14513}
14514
14515SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14516 DAGCombinerInfo &DCI) const {
14517 SelectionDAG &DAG = DCI.DAG;
14518 SDLoc SL(N);
14519
14520 SDValue LHS = N->getOperand(0);
14521 SDValue RHS = N->getOperand(1);
14522 EVT VT = LHS.getValueType();
14523 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14524
14525 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14526 if (!CRHS) {
14527 CRHS = dyn_cast<ConstantSDNode>(LHS);
14528 if (CRHS) {
14529 std::swap(LHS, RHS);
14530 CC = getSetCCSwappedOperands(CC);
14531 }
14532 }
14533
14534 if (CRHS) {
14535 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14536 isBoolSGPR(LHS.getOperand(0))) {
14537 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14538 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14539 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14540 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14541 if ((CRHS->isAllOnes() &&
14542 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14543 (CRHS->isZero() &&
14544 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14545 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14546 DAG.getConstant(-1, SL, MVT::i1));
14547 if ((CRHS->isAllOnes() &&
14548 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14549 (CRHS->isZero() &&
14550 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14551 return LHS.getOperand(0);
14552 }
14553
14554 const APInt &CRHSVal = CRHS->getAPIntValue();
14555 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14556 LHS.getOpcode() == ISD::SELECT &&
14557 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14558 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14559 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14560 isBoolSGPR(LHS.getOperand(0))) {
14561 // Given CT != FT:
14562 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14563 // setcc (select cc, CT, CF), CF, ne => cc
14564 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14565 // setcc (select cc, CT, CF), CT, eq => cc
14566 const APInt &CT = LHS.getConstantOperandAPInt(1);
14567 const APInt &CF = LHS.getConstantOperandAPInt(2);
14568
14569 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14570 (CT == CRHSVal && CC == ISD::SETNE))
14571 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14572 DAG.getConstant(-1, SL, MVT::i1));
14573 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14574 (CT == CRHSVal && CC == ISD::SETEQ))
14575 return LHS.getOperand(0);
14576 }
14577 }
14578
14579 if (VT != MVT::f32 && VT != MVT::f64 &&
14580 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14581 return SDValue();
14582
14583 // Match isinf/isfinite pattern
14584 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14585 // (fcmp one (fabs x), inf) -> (fp_class x,
14586 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14587 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14588 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14589 if (!CRHS)
14590 return SDValue();
14591
14592 const APFloat &APF = CRHS->getValueAPF();
14593 if (APF.isInfinity() && !APF.isNegative()) {
14594 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14595 SIInstrFlags::N_INFINITY;
14596 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14597 SIInstrFlags::P_ZERO |
14598 SIInstrFlags::N_NORMAL |
14599 SIInstrFlags::P_NORMAL |
14600 SIInstrFlags::N_SUBNORMAL |
14601 SIInstrFlags::P_SUBNORMAL;
14602 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14603 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14604 DAG.getConstant(Mask, SL, MVT::i32));
14605 }
14606 }
14607
14608 return SDValue();
14609}
14610
14611SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14612 DAGCombinerInfo &DCI) const {
14613 SelectionDAG &DAG = DCI.DAG;
14614 SDLoc SL(N);
14615 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14616
14617 SDValue Src = N->getOperand(0);
14618 SDValue Shift = N->getOperand(0);
14619
14620 // TODO: Extend type shouldn't matter (assuming legal types).
14621 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14622 Shift = Shift.getOperand(0);
14623
14624 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14625 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14626 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14627 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14628 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14629 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14630 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14631 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14632 SDLoc(Shift.getOperand(0)), MVT::i32);
14633
14634 unsigned ShiftOffset = 8 * Offset;
14635 if (Shift.getOpcode() == ISD::SHL)
14636 ShiftOffset -= C->getZExtValue();
14637 else
14638 ShiftOffset += C->getZExtValue();
14639
14640 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14641 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14642 MVT::f32, Shifted);
14643 }
14644 }
14645 }
14646
14647 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14648 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14649 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14650 // We simplified Src. If this node is not dead, visit it again so it is
14651 // folded properly.
14652 if (N->getOpcode() != ISD::DELETED_NODE)
14653 DCI.AddToWorklist(N);
14654 return SDValue(N, 0);
14655 }
14656
14657 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14658 if (SDValue DemandedSrc =
14659 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14660 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14661
14662 return SDValue();
14663}
14664
14665SDValue SITargetLowering::performClampCombine(SDNode *N,
14666 DAGCombinerInfo &DCI) const {
14667 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14668 if (!CSrc)
14669 return SDValue();
14670
14671 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14672 const APFloat &F = CSrc->getValueAPF();
14673 APFloat Zero = APFloat::getZero(F.getSemantics());
14674 if (F < Zero ||
14675 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14676 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14677 }
14678
14679 APFloat One(F.getSemantics(), "1.0");
14680 if (F > One)
14681 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14682
14683 return SDValue(CSrc, 0);
14684}
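// Constant-folding examples for the clamp above: clamp(2.5) folds to 1.0,
// clamp(-0.5) folds to 0.0, and clamp(NaN) folds to 0.0 only when DX10Clamp
// is enabled; otherwise the NaN constant is returned unchanged.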
14685
14686
14687 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14688 DAGCombinerInfo &DCI) const {
14689 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14690 return SDValue();
14691 switch (N->getOpcode()) {
14692 case ISD::ADD:
14693 return performAddCombine(N, DCI);
14694 case ISD::SUB:
14695 return performSubCombine(N, DCI);
14696 case ISD::UADDO_CARRY:
14697 case ISD::USUBO_CARRY:
14698 return performAddCarrySubCarryCombine(N, DCI);
14699 case ISD::FADD:
14700 return performFAddCombine(N, DCI);
14701 case ISD::FSUB:
14702 return performFSubCombine(N, DCI);
14703 case ISD::FDIV:
14704 return performFDivCombine(N, DCI);
14705 case ISD::SETCC:
14706 return performSetCCCombine(N, DCI);
14707 case ISD::FMAXNUM:
14708 case ISD::FMINNUM:
14709 case ISD::FMAXNUM_IEEE:
14710 case ISD::FMINNUM_IEEE:
14711 case ISD::FMAXIMUM:
14712 case ISD::FMINIMUM:
14713 case ISD::SMAX:
14714 case ISD::SMIN:
14715 case ISD::UMAX:
14716 case ISD::UMIN:
14717 case AMDGPUISD::FMIN_LEGACY:
14718 case AMDGPUISD::FMAX_LEGACY:
14719 return performMinMaxCombine(N, DCI);
14720 case ISD::FMA:
14721 return performFMACombine(N, DCI);
14722 case ISD::AND:
14723 return performAndCombine(N, DCI);
14724 case ISD::OR:
14725 return performOrCombine(N, DCI);
14726 case ISD::FSHR: {
14727 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14728 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14729 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14730 return matchPERM(N, DCI);
14731 }
14732 break;
14733 }
14734 case ISD::XOR:
14735 return performXorCombine(N, DCI);
14736 case ISD::ZERO_EXTEND:
14737 return performZeroExtendCombine(N, DCI);
14738 case ISD::SIGN_EXTEND_INREG:
14739 return performSignExtendInRegCombine(N , DCI);
14740 case AMDGPUISD::FP_CLASS:
14741 return performClassCombine(N, DCI);
14742 case ISD::FCANONICALIZE:
14743 return performFCanonicalizeCombine(N, DCI);
14744 case AMDGPUISD::RCP:
14745 return performRcpCombine(N, DCI);
14746 case ISD::FLDEXP:
14747 case AMDGPUISD::FRACT:
14748 case AMDGPUISD::RSQ:
14749 case AMDGPUISD::RCP_LEGACY:
14750 case AMDGPUISD::RCP_IFLAG:
14751 case AMDGPUISD::RSQ_CLAMP: {
14752 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14753 SDValue Src = N->getOperand(0);
14754 if (Src.isUndef())
14755 return Src;
14756 break;
14757 }
14758 case ISD::SINT_TO_FP:
14759 case ISD::UINT_TO_FP:
14760 return performUCharToFloatCombine(N, DCI);
14761 case ISD::FCOPYSIGN:
14762 return performFCopySignCombine(N, DCI);
14763 case AMDGPUISD::CVT_F32_UBYTE0:
14764 case AMDGPUISD::CVT_F32_UBYTE1:
14765 case AMDGPUISD::CVT_F32_UBYTE2:
14766 case AMDGPUISD::CVT_F32_UBYTE3:
14767 return performCvtF32UByteNCombine(N, DCI);
14768 case AMDGPUISD::FMED3:
14769 return performFMed3Combine(N, DCI);
14770 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14771 return performCvtPkRTZCombine(N, DCI);
14772 case AMDGPUISD::CLAMP:
14773 return performClampCombine(N, DCI);
14774 case ISD::SCALAR_TO_VECTOR: {
14775 SelectionDAG &DAG = DCI.DAG;
14776 EVT VT = N->getValueType(0);
14777
14778 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14779 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14780 SDLoc SL(N);
14781 SDValue Src = N->getOperand(0);
14782 EVT EltVT = Src.getValueType();
14783 if (EltVT != MVT::i16)
14784 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14785
14786 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14787 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14788 }
14789
14790 break;
14791 }
14792 case ISD::EXTRACT_VECTOR_ELT:
14793 return performExtractVectorEltCombine(N, DCI);
14794 case ISD::INSERT_VECTOR_ELT:
14795 return performInsertVectorEltCombine(N, DCI);
14796 case ISD::FP_ROUND:
14797 return performFPRoundCombine(N, DCI);
14798 case ISD::LOAD: {
14799 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14800 return Widened;
14801 [[fallthrough]];
14802 }
14803 default: {
14804 if (!DCI.isBeforeLegalize()) {
14805 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14806 return performMemSDNodeCombine(MemNode, DCI);
14807 }
14808
14809 break;
14810 }
14811 }
14812
14813 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14814}
14815
14816/// Helper function for adjustWritemask
14817static unsigned SubIdx2Lane(unsigned Idx) {
14818 switch (Idx) {
14819 default: return ~0u;
14820 case AMDGPU::sub0: return 0;
14821 case AMDGPU::sub1: return 1;
14822 case AMDGPU::sub2: return 2;
14823 case AMDGPU::sub3: return 3;
14824 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14825 }
14826}
14827
14828/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14829SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14830 SelectionDAG &DAG) const {
14831 unsigned Opcode = Node->getMachineOpcode();
14832
14833 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14834 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14835 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14836 return Node; // not implemented for D16
14837
14838 SDNode *Users[5] = { nullptr };
14839 unsigned Lane = 0;
14840 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14841 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14842 unsigned NewDmask = 0;
14843 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14844 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14845 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14846 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14847 ? true
14848 : false;
14849 unsigned TFCLane = 0;
14850 bool HasChain = Node->getNumValues() > 1;
14851
14852 if (OldDmask == 0) {
14853 // These are folded out, but on the chance it happens don't assert.
14854 return Node;
14855 }
14856
14857 unsigned OldBitsSet = llvm::popcount(OldDmask);
14858 // Work out which is the TFE/LWE lane if that is enabled.
14859 if (UsesTFC) {
14860 TFCLane = OldBitsSet;
14861 }
14862
14863 // Try to figure out the used register components
14864 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14865 I != E; ++I) {
14866
14867 // Don't look at users of the chain.
14868 if (I.getUse().getResNo() != 0)
14869 continue;
14870
14871 // Abort if we can't understand the usage
14872 if (!I->isMachineOpcode() ||
14873 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14874 return Node;
14875
14876 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14877 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14878 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14879 // set, etc.
14880 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14881 if (Lane == ~0u)
14882 return Node;
14883
14884 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14885 if (UsesTFC && Lane == TFCLane) {
14886 Users[Lane] = *I;
14887 } else {
14888 // Set which texture component corresponds to the lane.
14889 unsigned Comp;
14890 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14891 Comp = llvm::countr_zero(Dmask);
14892 Dmask &= ~(1 << Comp);
14893 }
14894
14895 // Abort if we have more than one user per component.
14896 if (Users[Lane])
14897 return Node;
14898
14899 Users[Lane] = *I;
14900 NewDmask |= 1 << Comp;
14901 }
14902 }
14903
14904 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14905 bool NoChannels = !NewDmask;
14906 if (NoChannels) {
14907 if (!UsesTFC) {
14908 // No uses of the result and not using TFC. Then do nothing.
14909 return Node;
14910 }
14911 // If the original dmask has one channel - then nothing to do
14912 if (OldBitsSet == 1)
14913 return Node;
14914 // Use an arbitrary dmask - required for the instruction to work
14915 NewDmask = 1;
14916 }
14917 // Abort if there's no change
14918 if (NewDmask == OldDmask)
14919 return Node;
14920
14921 unsigned BitsSet = llvm::popcount(NewDmask);
14922
14923 // Check for TFE or LWE - increase the number of channels by one to account
14924 // for the extra return value
14925 // This will need adjustment for D16 if this is also included in
14926 // adjustWritemask (this function), but at present D16 is excluded.
14927 unsigned NewChannels = BitsSet + UsesTFC;
14928
14929 int NewOpcode =
14930 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14931 assert(NewOpcode != -1 &&
14932 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14933 "failed to find equivalent MIMG op");
14934
14935 // Adjust the writemask in the node
14936 SmallVector<SDValue, 12> Ops;
14937 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14938 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14939 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14940
14941 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14942
14943 MVT ResultVT = NewChannels == 1 ?
14944 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14945 NewChannels == 5 ? 8 : NewChannels);
14946 SDVTList NewVTList = HasChain ?
14947 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14948
14949
14950 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14951 NewVTList, Ops);
14952
14953 if (HasChain) {
14954 // Update chain.
14955 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14956 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14957 }
14958
14959 if (NewChannels == 1) {
14960 assert(Node->hasNUsesOfValue(1, 0));
14961 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14962 SDLoc(Node), Users[Lane]->getValueType(0),
14963 SDValue(NewNode, 0));
14964 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14965 return nullptr;
14966 }
14967
14968 // Update the users of the node with the new indices
14969 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14970 SDNode *User = Users[i];
14971 if (!User) {
14972 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14973 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14974 if (i || !NoChannels)
14975 continue;
14976 } else {
14977 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14978 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14979 if (NewUser != User) {
14980 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14981 DAG.RemoveDeadNode(User);
14982 }
14983 }
14984
14985 switch (Idx) {
14986 default: break;
14987 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14988 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14989 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14990 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14991 }
14992 }
14993
14994 DAG.RemoveDeadNode(Node);
14995 return nullptr;
14996}
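// Example of the rewrite above: an image load with dmask = 0xf whose only
// users extract sub0 and sub2 gets NewDmask = 0x5 and NewChannels = 2 (plus
// one if TFE/LWE is enabled); the instruction is reselected with the
// narrower dmask and the surviving users are renumbered to the consecutive
// subregisters sub0 and sub1.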
14997
14998 static bool isFrameIndexOp(SDValue Op) {
14999 if (Op.getOpcode() == ISD::AssertZext)
15000 Op = Op.getOperand(0);
15001
15002 return isa<FrameIndexSDNode>(Op);
15003}
15004
15005/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15006/// with frame index operands.
15007 /// LLVM assumes that inputs to these instructions are registers.
15008 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
15009 SelectionDAG &DAG) const {
15010 if (Node->getOpcode() == ISD::CopyToReg) {
15011 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15012 SDValue SrcVal = Node->getOperand(2);
15013
15014 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15015 // to try understanding copies to physical registers.
15016 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15017 SDLoc SL(Node);
15018 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15019 SDValue VReg = DAG.getRegister(
15020 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15021
15022 SDNode *Glued = Node->getGluedNode();
15023 SDValue ToVReg
15024 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15025 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15026 SDValue ToResultReg
15027 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15028 VReg, ToVReg.getValue(1));
15029 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15030 DAG.RemoveDeadNode(Node);
15031 return ToResultReg.getNode();
15032 }
15033 }
15034
15035 SmallVector<SDValue, 8> Ops;
15036 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15037 if (!isFrameIndexOp(Node->getOperand(i))) {
15038 Ops.push_back(Node->getOperand(i));
15039 continue;
15040 }
15041
15042 SDLoc DL(Node);
15043 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15044 Node->getOperand(i).getValueType(),
15045 Node->getOperand(i)), 0));
15046 }
15047
15048 return DAG.UpdateNodeOperands(Node, Ops);
15049}
15050
15051/// Fold the instructions after selecting them.
15052/// Returns null if users were already updated.
15053 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15054 SelectionDAG &DAG) const {
15055 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15056 unsigned Opcode = Node->getMachineOpcode();
15057
15058 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15059 !TII->isGather4(Opcode) &&
15060 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15061 return adjustWritemask(Node, DAG);
15062 }
15063
15064 if (Opcode == AMDGPU::INSERT_SUBREG ||
15065 Opcode == AMDGPU::REG_SEQUENCE) {
15066 legalizeTargetIndependentNode(Node, DAG);
15067 return Node;
15068 }
15069
15070 switch (Opcode) {
15071 case AMDGPU::V_DIV_SCALE_F32_e64:
15072 case AMDGPU::V_DIV_SCALE_F64_e64: {
15073 // Satisfy the operand register constraint when one of the inputs is
15074 // undefined. Ordinarily each undef value will have its own implicit_def of
15075 // a vreg, so force these to use a single register.
15076 SDValue Src0 = Node->getOperand(1);
15077 SDValue Src1 = Node->getOperand(3);
15078 SDValue Src2 = Node->getOperand(5);
15079
15080 if ((Src0.isMachineOpcode() &&
15081 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15082 (Src0 == Src1 || Src0 == Src2))
15083 break;
15084
15085 MVT VT = Src0.getValueType().getSimpleVT();
15086 const TargetRegisterClass *RC =
15087 getRegClassFor(VT, Src0.getNode()->isDivergent());
15088
15089 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15090 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15091
15092 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15093 UndefReg, Src0, SDValue());
15094
15095 // src0 must be the same register as src1 or src2, even if the value is
15096 // undefined, so make sure we don't violate this constraint.
15097 if (Src0.isMachineOpcode() &&
15098 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15099 if (Src1.isMachineOpcode() &&
15100 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15101 Src0 = Src1;
15102 else if (Src2.isMachineOpcode() &&
15103 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15104 Src0 = Src2;
15105 else {
15106 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15107 Src0 = UndefReg;
15108 Src1 = UndefReg;
15109 }
15110 } else
15111 break;
15112
15113 SmallVector<SDValue, 9> Ops(Node->ops());
15114 Ops[1] = Src0;
15115 Ops[3] = Src1;
15116 Ops[5] = Src2;
15117 Ops.push_back(ImpDef.getValue(1));
15118 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15119 }
15120 default:
15121 break;
15122 }
15123
15124 return Node;
15125}
15126
15127// Any MIMG instructions that use tfe or lwe require an initialization of the
15128// result register that will be written in the case of a memory access failure.
15129// The required code is also added to tie this init code to the result of the
15130 // img instruction.
15131 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15132 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15133 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15134 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15135 MachineBasicBlock &MBB = *MI.getParent();
15136
15137 int DstIdx =
15138 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15139 unsigned InitIdx = 0;
15140
15141 if (TII->isImage(MI)) {
15142 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15143 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15144 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15145
15146 if (!TFE && !LWE) // intersect_ray
15147 return;
15148
15149 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15150 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15151 unsigned D16Val = D16 ? D16->getImm() : 0;
15152
15153 if (!TFEVal && !LWEVal)
15154 return;
15155
15156 // At least one of TFE or LWE is non-zero
15157 // We have to insert a suitable initialization of the result value and
15158 // tie this to the dest of the image instruction.
15159
15160 // Calculate which dword we have to initialize to 0.
15161 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15162
15163 // check that dmask operand is found.
15164 assert(MO_Dmask && "Expected dmask operand in instruction");
15165
15166 unsigned dmask = MO_Dmask->getImm();
15167 // Determine the number of active lanes taking into account the
15168 // Gather4 special case
15169 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15170
15171 bool Packed = !Subtarget->hasUnpackedD16VMem();
15172
15173 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
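// Worked example (illustrative): a dmask of 0b0111 gives ActiveLanes = 3, so
// InitIdx is 4 (three data dwords plus the TFE/LWE status dword); with packed
// D16 the data shrinks to two dwords and InitIdx becomes 3.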
15174
15175 // Abandon attempt if the dst size isn't large enough
15176 // - this is in fact an error but this is picked up elsewhere and
15177 // reported correctly.
15178 uint32_t DstSize =
15179 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15180 if (DstSize < InitIdx)
15181 return;
15182 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15183 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15184 } else {
15185 return;
15186 }
15187
15188 const DebugLoc &DL = MI.getDebugLoc();
15189
15190 // Create a register for the initialization value.
15191 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15192 unsigned NewDst = 0; // Final initialized value will be in here
15193
15194 // If PRTStrictNull feature is enabled (the default) then initialize
15195 // all the result registers to 0, otherwise just the error indication
15196 // register (VGPRn+1)
15197 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15198 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
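// Illustration: with PRTStrictNull (the default) and InitIdx = 4, dwords 0..3
// of the result are zero-initialized by the loop below; without it only the
// final dword, the error indication register, is initialized.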
15199
15200 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15201 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15202 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15203 // Initialize dword
15204 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15205 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15206 .addImm(0);
15207 // Insert into the super-reg
15208 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15209 .addReg(PrevDst)
15210 .addReg(SubReg)
15211 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
15212
15213 PrevDst = NewDst;
15214 }
15215
15216 // Add as an implicit operand
15217 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15218
15219 // Tie the just added implicit operand to the dst
15220 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15221}
15222
15223/// Assign the register class depending on the number of
15224/// bits set in the writemask
15225 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15226 SDNode *Node) const {
15227 const SIInstrInfo *TII = Subtarget->getInstrInfo();
15228
15229 MachineFunction *MF = MI.getParent()->getParent();
15230 MachineRegisterInfo &MRI = MF->getRegInfo();
15231 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15232
15233 if (TII->isVOP3(MI.getOpcode())) {
15234 // Make sure constant bus requirements are respected.
15235 TII->legalizeOperandsVOP3(MRI, MI);
15236
15237 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15238 // This saves a chain-copy of registers and better balances register
15239 // use between vgpr and agpr, as agpr tuples tend to be big.
15240 if (!MI.getDesc().operands().empty()) {
15241 unsigned Opc = MI.getOpcode();
15242 bool HasAGPRs = Info->mayNeedAGPRs();
15243 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15244 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15245 for (auto I :
15246 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15247 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15248 if (I == -1)
15249 break;
15250 if ((I == Src2Idx) && (HasAGPRs))
15251 break;
15252 MachineOperand &Op = MI.getOperand(I);
15253 if (!Op.isReg() || !Op.getReg().isVirtual())
15254 continue;
15255 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15256 if (!TRI->hasAGPRs(RC))
15257 continue;
15258 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15259 if (!Src || !Src->isCopy() ||
15260 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15261 continue;
15262 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15263 // All uses of agpr64 and agpr32 can also accept vgpr except for
15264 // v_accvgpr_read, but we do not produce agpr reads during selection,
15265 // so no use checks are needed.
15266 MRI.setRegClass(Op.getReg(), NewRC);
15267 }
15268
15269 if (!HasAGPRs)
15270 return;
15271
15272 // Resolve the rest of AV operands to AGPRs.
15273 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15274 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15275 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15276 if (TRI->isVectorSuperClass(RC)) {
15277 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15278 MRI.setRegClass(Src2->getReg(), NewRC);
15279 if (Src2->isTied())
15280 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15281 }
15282 }
15283 }
15284 }
15285
15286 return;
15287 }
15288
15289 if (TII->isImage(MI))
15290 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15291}
15292
15293 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15294 uint64_t Val) {
15295 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15296 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15297}
15298
15299 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15300 const SDLoc &DL,
15301 SDValue Ptr) const {
15302 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15303
15304 // Build the half of the subregister with the constants before building the
15305 // full 128-bit register. If we are building multiple resource descriptors,
15306 // this will allow CSEing of the 2-component register.
15307 const SDValue Ops0[] = {
15308 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15309 buildSMovImm32(DAG, DL, 0),
15310 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15311 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15312 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15313 };
15314
15315 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15316 MVT::v2i32, Ops0), 0);
15317
15318 // Combine the constants and the pointer.
15319 const SDValue Ops1[] = {
15320 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15321 Ptr,
15322 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15323 SubRegHi,
15324 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15325 };
15326
15327 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15328}
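// Illustration of the descriptor built above: the 64-bit pointer lands in
// dwords 0-1 of the v4i32 resource, dword 2 is zero, and dword 3 holds the
// upper half of getDefaultRsrcDataFormat(), so only the constant half is a
// candidate for CSE across descriptors.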
15329
15330/// Return a resource descriptor with the 'Add TID' bit enabled
15331/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15332/// of the resource descriptor) to create an offset, which is added to
15333/// the resource pointer.
15334 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15335 SDValue Ptr, uint32_t RsrcDword1,
15336 uint64_t RsrcDword2And3) const {
15337 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15338 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15339 if (RsrcDword1) {
15340 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15341 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15342 0);
15343 }
15344
15345 SDValue DataLo = buildSMovImm32(DAG, DL,
15346 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15347 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15348
15349 const SDValue Ops[] = {
15350 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15351 PtrLo,
15352 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15353 PtrHi,
15354 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15355 DataLo,
15356 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15357 DataHi,
15358 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15359 };
15360
15361 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15362}
15363
15364//===----------------------------------------------------------------------===//
15365// SI Inline Assembly Support
15366//===----------------------------------------------------------------------===//
15367
15368std::pair<unsigned, const TargetRegisterClass *>
15369 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15370 StringRef Constraint,
15371 MVT VT) const {
15372 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15373
15374 const TargetRegisterClass *RC = nullptr;
15375 if (Constraint.size() == 1) {
15376 const unsigned BitWidth = VT.getSizeInBits();
15377 switch (Constraint[0]) {
15378 default:
15379 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15380 case 's':
15381 case 'r':
15382 switch (BitWidth) {
15383 case 16:
15384 RC = &AMDGPU::SReg_32RegClass;
15385 break;
15386 case 64:
15387 RC = &AMDGPU::SGPR_64RegClass;
15388 break;
15389 default:
15390 RC = TRI->getSGPRClassForBitWidth(BitWidth);
15391 if (!RC)
15392 return std::pair(0U, nullptr);
15393 break;
15394 }
15395 break;
15396 case 'v':
15397 switch (BitWidth) {
15398 case 16:
15399 RC = &AMDGPU::VGPR_32RegClass;
15400 break;
15401 default:
15402 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15403 if (!RC)
15404 return std::pair(0U, nullptr);
15405 break;
15406 }
15407 break;
15408 case 'a':
15409 if (!Subtarget->hasMAIInsts())
15410 break;
15411 switch (BitWidth) {
15412 case 16:
15413 RC = &AMDGPU::AGPR_32RegClass;
15414 break;
15415 default:
15416 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15417 if (!RC)
15418 return std::pair(0U, nullptr);
15419 break;
15420 }
15421 break;
15422 }
15423 // We actually support i128, i16 and f16 as inline parameters
15424 // even if they are not reported as legal
15425 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15426 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15427 return std::pair(0U, RC);
15428 }
15429
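// Examples (illustrative): "{v8}" selects VGPR_32 register v8, while a range
// such as "{s[0:3]}" parses Idx = 0 and End = 3, giving a 128-bit SGPR class
// whose first register is s0.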
15430 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15431 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15432 if (RegName.consume_front("v")) {
15433 RC = &AMDGPU::VGPR_32RegClass;
15434 } else if (RegName.consume_front("s")) {
15435 RC = &AMDGPU::SGPR_32RegClass;
15436 } else if (RegName.consume_front("a")) {
15437 RC = &AMDGPU::AGPR_32RegClass;
15438 }
15439
15440 if (RC) {
15441 uint32_t Idx;
15442 if (RegName.consume_front("[")) {
15443 uint32_t End;
15444 bool Failed = RegName.consumeInteger(10, Idx);
15445 Failed |= !RegName.consume_front(":");
15446 Failed |= RegName.consumeInteger(10, End);
15447 Failed |= !RegName.consume_back("]");
15448 if (!Failed) {
15449 uint32_t Width = (End - Idx + 1) * 32;
15450 MCRegister Reg = RC->getRegister(Idx);
15451 if (SIRegisterInfo::isVGPRClass(RC))
15452 RC = TRI->getVGPRClassForBitWidth(Width);
15453 else if (SIRegisterInfo::isSGPRClass(RC))
15454 RC = TRI->getSGPRClassForBitWidth(Width);
15455 else if (SIRegisterInfo::isAGPRClass(RC))
15456 RC = TRI->getAGPRClassForBitWidth(Width);
15457 if (RC) {
15458 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15459 return std::pair(Reg, RC);
15460 }
15461 }
15462 } else {
15463 bool Failed = RegName.getAsInteger(10, Idx);
15464 if (!Failed && Idx < RC->getNumRegs())
15465 return std::pair(RC->getRegister(Idx), RC);
15466 }
15467 }
15468 }
15469
15470 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15471 if (Ret.first)
15472 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15473
15474 return Ret;
15475}
15476
15477static bool isImmConstraint(StringRef Constraint) {
15478 if (Constraint.size() == 1) {
15479 switch (Constraint[0]) {
15480 default: break;
15481 case 'I':
15482 case 'J':
15483 case 'A':
15484 case 'B':
15485 case 'C':
15486 return true;
15487 }
15488 } else if (Constraint == "DA" ||
15489 Constraint == "DB") {
15490 return true;
15491 }
15492 return false;
15493}
15494
15495 SITargetLowering::ConstraintType
15496 SITargetLowering::getConstraintType(StringRef Constraint) const {
15497 if (Constraint.size() == 1) {
15498 switch (Constraint[0]) {
15499 default: break;
15500 case 's':
15501 case 'v':
15502 case 'a':
15503 return C_RegisterClass;
15504 }
15505 }
15506 if (isImmConstraint(Constraint)) {
15507 return C_Other;
15508 }
15509 return TargetLowering::getConstraintType(Constraint);
15510}
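// For example, an IR-level inline asm call such as
//   call i32 asm "v_mov_b32 $0, $1", "=v,s"(i32 %x)
// resolves 'v' and 's' through C_RegisterClass above, while single-letter
// immediate constraints like 'I' or 'B' are classified as C_Other and are
// later validated by checkAsmConstraintVal.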
15511
15512static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15513 if (!AMDGPU::isInlinableIntLiteral(Val)) {
15514 Val = Val & maskTrailingOnes<uint64_t>(Size);
15515 }
15516 return Val;
15517}
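// For example, masking a 16-bit operand value of 0xFFFFFFFFFFFF8000 (the
// sign-extended form of -32768) with maskTrailingOnes<uint64_t>(16) yields
// 0x8000, so only the bits that actually fit the operand are emitted.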
15518
15519 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15520 StringRef Constraint,
15521 std::vector<SDValue> &Ops,
15522 SelectionDAG &DAG) const {
15523 if (isImmConstraint(Constraint)) {
15524 uint64_t Val;
15525 if (getAsmOperandConstVal(Op, Val) &&
15526 checkAsmConstraintVal(Op, Constraint, Val)) {
15527 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15528 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15529 }
15530 } else {
15531 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15532 }
15533}
15534
15535 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15536 unsigned Size = Op.getScalarValueSizeInBits();
15537 if (Size > 64)
15538 return false;
15539
15540 if (Size == 16 && !Subtarget->has16BitInsts())
15541 return false;
15542
15543 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15544 Val = C->getSExtValue();
15545 return true;
15546 }
15547 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15548 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15549 return true;
15550 }
15551 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15552 if (Size != 16 || Op.getNumOperands() != 2)
15553 return false;
15554 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15555 return false;
15556 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15557 Val = C->getSExtValue();
15558 return true;
15559 }
15560 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15561 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15562 return true;
15563 }
15564 }
15565
15566 return false;
15567}
15568
15569 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15570 uint64_t Val) const {
15571 if (Constraint.size() == 1) {
15572 switch (Constraint[0]) {
15573 case 'I':
15574 return AMDGPU::isInlinableIntLiteral(Val);
15575 case 'J':
15576 return isInt<16>(Val);
15577 case 'A':
15578 return checkAsmConstraintValA(Op, Val);
15579 case 'B':
15580 return isInt<32>(Val);
15581 case 'C':
15582 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15584 default:
15585 break;
15586 }
15587 } else if (Constraint.size() == 2) {
15588 if (Constraint == "DA") {
15589 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15590 int64_t LoBits = static_cast<int32_t>(Val);
15591 return checkAsmConstraintValA(Op, HiBits, 32) &&
15592 checkAsmConstraintValA(Op, LoBits, 32);
15593 }
15594 if (Constraint == "DB") {
15595 return true;
15596 }
15597 }
15598 llvm_unreachable("Invalid asm constraint");
15599}
15600
15601 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15602 unsigned MaxSize) const {
15603 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15604 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15605 if (Size == 16) {
15606 MVT VT = Op.getSimpleValueType();
15607 switch (VT.SimpleTy) {
15608 default:
15609 return false;
15610 case MVT::i16:
15611 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15612 case MVT::f16:
15613 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15614 case MVT::bf16:
15615 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15616 case MVT::v2i16:
15617 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15618 case MVT::v2f16:
15619 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15620 case MVT::v2bf16:
15621 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15622 }
15623 }
15624 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15625 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15626 return true;
15627 return false;
15628}
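// For example, an f16 operand whose bits are 0x3C00 (the value 1.0) is an
// inline constant and satisfies the 'A' constraint, whereas an arbitrary
// literal such as 0x1234 does not.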
15629
15630static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15631 switch (UnalignedClassID) {
15632 case AMDGPU::VReg_64RegClassID:
15633 return AMDGPU::VReg_64_Align2RegClassID;
15634 case AMDGPU::VReg_96RegClassID:
15635 return AMDGPU::VReg_96_Align2RegClassID;
15636 case AMDGPU::VReg_128RegClassID:
15637 return AMDGPU::VReg_128_Align2RegClassID;
15638 case AMDGPU::VReg_160RegClassID:
15639 return AMDGPU::VReg_160_Align2RegClassID;
15640 case AMDGPU::VReg_192RegClassID:
15641 return AMDGPU::VReg_192_Align2RegClassID;
15642 case AMDGPU::VReg_224RegClassID:
15643 return AMDGPU::VReg_224_Align2RegClassID;
15644 case AMDGPU::VReg_256RegClassID:
15645 return AMDGPU::VReg_256_Align2RegClassID;
15646 case AMDGPU::VReg_288RegClassID:
15647 return AMDGPU::VReg_288_Align2RegClassID;
15648 case AMDGPU::VReg_320RegClassID:
15649 return AMDGPU::VReg_320_Align2RegClassID;
15650 case AMDGPU::VReg_352RegClassID:
15651 return AMDGPU::VReg_352_Align2RegClassID;
15652 case AMDGPU::VReg_384RegClassID:
15653 return AMDGPU::VReg_384_Align2RegClassID;
15654 case AMDGPU::VReg_512RegClassID:
15655 return AMDGPU::VReg_512_Align2RegClassID;
15656 case AMDGPU::VReg_1024RegClassID:
15657 return AMDGPU::VReg_1024_Align2RegClassID;
15658 case AMDGPU::AReg_64RegClassID:
15659 return AMDGPU::AReg_64_Align2RegClassID;
15660 case AMDGPU::AReg_96RegClassID:
15661 return AMDGPU::AReg_96_Align2RegClassID;
15662 case AMDGPU::AReg_128RegClassID:
15663 return AMDGPU::AReg_128_Align2RegClassID;
15664 case AMDGPU::AReg_160RegClassID:
15665 return AMDGPU::AReg_160_Align2RegClassID;
15666 case AMDGPU::AReg_192RegClassID:
15667 return AMDGPU::AReg_192_Align2RegClassID;
15668 case AMDGPU::AReg_256RegClassID:
15669 return AMDGPU::AReg_256_Align2RegClassID;
15670 case AMDGPU::AReg_512RegClassID:
15671 return AMDGPU::AReg_512_Align2RegClassID;
15672 case AMDGPU::AReg_1024RegClassID:
15673 return AMDGPU::AReg_1024_Align2RegClassID;
15674 default:
15675 return -1;
15676 }
15677}
15678
15679// Figure out which registers should be reserved for stack access. Only after
15680// the function is legalized do we know all of the non-spill stack objects or if
15681 // calls are present.
15682 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15683 MachineRegisterInfo &MRI = MF.getRegInfo();
15684 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15685 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15686 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15687 const SIInstrInfo *TII = ST.getInstrInfo();
15688
15689 if (Info->isEntryFunction()) {
15690 // Callable functions have fixed registers used for stack access.
15691 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
15692 }
15693
15694 // TODO: Move this logic to getReservedRegs()
15695 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15696 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15697 Register SReg = ST.isWave32()
15698 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15699 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15700 &AMDGPU::SGPR_64RegClass);
15701 Info->setSGPRForEXECCopy(SReg);
15702
15703 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15704 Info->getStackPtrOffsetReg()));
15705 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15706 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15707
15708 // We need to worry about replacing the default register with itself in case
15709 // of MIR testcases missing the MFI.
15710 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15711 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15712
15713 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15714 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15715
15716 Info->limitOccupancy(MF);
15717
15718 if (ST.isWave32() && !MF.empty()) {
15719 for (auto &MBB : MF) {
15720 for (auto &MI : MBB) {
15721 TII->fixImplicitOperands(MI);
15722 }
15723 }
15724 }
15725
15726 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15727 // classes if required. Ideally the register class constraints would differ
15728 // per-subtarget, but there's no easy way to achieve that right now. This is
15729 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15730 // from using them as the register class for legal types.
15731 if (ST.needsAlignedVGPRs()) {
15732 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15733 const Register Reg = Register::index2VirtReg(I);
15734 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15735 if (!RC)
15736 continue;
15737 int NewClassID = getAlignedAGPRClassID(RC->getID());
15738 if (NewClassID != -1)
15739 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15740 }
15741 }
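// For example, a virtual register created with the VReg_64 class is retyped
// to VReg_64_Align2 in the loop above, so the allocator can only place it in
// an even-aligned VGPR pair on subtargets that require aligned VGPR tuples.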
15742
15743 TargetLoweringBase::finalizeLowering(MF);
15744}
15745
15746 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15747 KnownBits &Known,
15748 const APInt &DemandedElts,
15749 const SelectionDAG &DAG,
15750 unsigned Depth) const {
15751 Known.resetAll();
15752 unsigned Opc = Op.getOpcode();
15753 switch (Opc) {
15754 case ISD::INTRINSIC_WO_CHAIN: {
15755 unsigned IID = Op.getConstantOperandVal(0);
15756 switch (IID) {
15757 case Intrinsic::amdgcn_mbcnt_lo:
15758 case Intrinsic::amdgcn_mbcnt_hi: {
15759 const GCNSubtarget &ST =
15760 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15761 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
15762 // most 31 + src1.
15763 Known.Zero.setBitsFrom(
15764 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
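// For example, on a wave32 target mbcnt_lo yields at most 31 before src1 is
// added, so bits 5 and up of that partial result are known zero; the
// KnownBits::add below then folds in whatever is known about src1.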
15765 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15766 Known = KnownBits::add(Known, Known2);
15767 return;
15768 }
15769 }
15770 break;
15771 }
15772 }
15773 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15774 Op, Known, DemandedElts, DAG, Depth);
15775}
15776
15777 void SITargetLowering::computeKnownBitsForFrameIndex(
15778 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15779 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
15780
15781 // Set the high bits to zero based on the maximum allowed scratch size per
15782 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15783 // calculation won't overflow, so assume the sign bit is never set.
15784 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15785}
15786
15787 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15788 KnownBits &Known, unsigned Dim) {
15789 unsigned MaxValue =
15790 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15791 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15792}
15793
15794 void SITargetLowering::computeKnownBitsForTargetInstr(
15795 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15796 const MachineRegisterInfo &MRI, unsigned Depth) const {
15797 const MachineInstr *MI = MRI.getVRegDef(R);
15798 switch (MI->getOpcode()) {
15799 case AMDGPU::G_INTRINSIC:
15800 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15801 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
15802 switch (IID) {
15803 case Intrinsic::amdgcn_workitem_id_x:
15804 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15805 break;
15806 case Intrinsic::amdgcn_workitem_id_y:
15807 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15808 break;
15809 case Intrinsic::amdgcn_workitem_id_z:
15810 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15811 break;
15812 case Intrinsic::amdgcn_mbcnt_lo:
15813 case Intrinsic::amdgcn_mbcnt_hi: {
15814 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
15815 // most 31 + src1.
15816 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
15817 ? getSubtarget()->getWavefrontSizeLog2()
15818 : 5);
15819 KnownBits Known2;
15820 KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
15821 Depth + 1);
15822 Known = KnownBits::add(Known, Known2);
15823 break;
15824 }
15825 case Intrinsic::amdgcn_groupstaticsize: {
15826 // We can report everything over the maximum size as 0. We can't report
15827 // based on the actual size because we don't know if it's accurate or not
15828 // at any given point.
15829 Known.Zero.setHighBits(
15830 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15831 break;
15832 }
15833 }
15834 break;
15835 }
15836 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15837 Known.Zero.setHighBits(24);
15838 break;
15839 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15840 Known.Zero.setHighBits(16);
15841 break;
15842 case AMDGPU::G_AMDGPU_SMED3:
15843 case AMDGPU::G_AMDGPU_UMED3: {
15844 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15845
15846 KnownBits Known2;
15847 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15848 if (Known2.isUnknown())
15849 break;
15850
15851 KnownBits Known1;
15852 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15853 if (Known1.isUnknown())
15854 break;
15855
15856 KnownBits Known0;
15857 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15858 if (Known0.isUnknown())
15859 break;
15860
15861 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15862 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15863 Known.One = Known0.One & Known1.One & Known2.One;
15864 break;
15865 }
15866 }
15867}
15868
15869 Align SITargetLowering::computeKnownAlignForTargetInstr(
15870 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15871 unsigned Depth) const {
15872 const MachineInstr *MI = MRI.getVRegDef(R);
15873 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15874 // FIXME: Can this move to generic code? What about the case where the call
15875 // site specifies a lower alignment?
15876 Intrinsic::ID IID = GI->getIntrinsicID();
15877 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15878 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15879 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15880 return *RetAlign;
15881 }
15882 return Align(1);
15883}
15884
15885 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15886 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15887 const Align CacheLineAlign = Align(64);
15888
15889 // Pre-GFX10 targets did not benefit from loop alignment
15890 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15891 getSubtarget()->hasInstFwdPrefetchBug())
15892 return PrefAlign;
15893
15894 // On GFX10 I$ is 4 x 64 bytes cache lines.
15895 // By default prefetcher keeps one cache line behind and reads two ahead.
15896 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
15897 // behind and one ahead.
15898 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
15899 // If the loop fits in 64 bytes it always spans no more than two cache lines and
15900 // does not need an alignment.
15901 // Otherwise, if the loop is at most 128 bytes, we do not need to modify the prefetch;
15902 // if it is at most 192 bytes, we need two lines behind.
15903
15904 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15905 const MachineBasicBlock *Header = ML->getHeader();
15906 if (Header->getAlignment() != PrefAlign)
15907 return Header->getAlignment(); // Already processed.
15908
15909 unsigned LoopSize = 0;
15910 for (const MachineBasicBlock *MBB : ML->blocks()) {
15911 // If an inner loop block is aligned, assume on average half of the alignment
15912 // size to be added as nops.
15913 if (MBB != Header)
15914 LoopSize += MBB->getAlignment().value() / 2;
15915
15916 for (const MachineInstr &MI : *MBB) {
15917 LoopSize += TII->getInstSizeInBytes(MI);
15918 if (LoopSize > 192)
15919 return PrefAlign;
15920 }
15921 }
15922
15923 if (LoopSize <= 64)
15924 return PrefAlign;
15925
15926 if (LoopSize <= 128)
15927 return CacheLineAlign;
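// For example, a 100-byte loop is simply aligned to the 64-byte cache line,
// while a 180-byte loop additionally gets the S_INST_PREFETCH adjustment
// emitted below so that two cache lines are kept behind the PC.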
15928
15929 // If any of the parent loops is surrounded by prefetch instructions, do not
15930 // insert new ones for the inner loop, which would reset the parent's settings.
15931 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15932 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15933 auto I = Exit->getFirstNonDebugInstr();
15934 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15935 return CacheLineAlign;
15936 }
15937 }
15938
15939 MachineBasicBlock *Pre = ML->getLoopPreheader();
15940 MachineBasicBlock *Exit = ML->getExitBlock();
15941
15942 if (Pre && Exit) {
15943 auto PreTerm = Pre->getFirstTerminator();
15944 if (PreTerm == Pre->begin() ||
15945 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15946 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15947 .addImm(1); // prefetch 2 lines behind PC
15948
15949 auto ExitHead = Exit->getFirstNonDebugInstr();
15950 if (ExitHead == Exit->end() ||
15951 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15952 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15953 .addImm(2); // prefetch 1 line behind PC
15954 }
15955
15956 return CacheLineAlign;
15957}
15958
15960static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15961 assert(N->getOpcode() == ISD::CopyFromReg);
15962 do {
15963 // Follow the chain until we find an INLINEASM node.
15964 N = N->getOperand(0).getNode();
15965 if (N->getOpcode() == ISD::INLINEASM ||
15966 N->getOpcode() == ISD::INLINEASM_BR)
15967 return true;
15968 } while (N->getOpcode() == ISD::CopyFromReg);
15969 return false;
15970}
15971
15972 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15973 FunctionLoweringInfo *FLI,
15974 UniformityInfo *UA) const {
15975 switch (N->getOpcode()) {
15976 case ISD::CopyFromReg: {
15977 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15978 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15979 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15980 Register Reg = R->getReg();
15981
15982 // FIXME: Why does this need to consider isLiveIn?
15983 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15984 return !TRI->isSGPRReg(MRI, Reg);
15985
15986 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15987 return UA->isDivergent(V);
15988
15989 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15990 return !TRI->isSGPRReg(MRI, Reg);
15991 }
15992 case ISD::LOAD: {
15993 const LoadSDNode *L = cast<LoadSDNode>(N);
15994 unsigned AS = L->getAddressSpace();
15995 // A flat load may access private memory.
15996 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15997 }
15998 case ISD::CALLSEQ_END:
15999 return true;
16000 case ISD::INTRINSIC_WO_CHAIN:
16001 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16002 case ISD::INTRINSIC_W_CHAIN:
16003 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16022 // Target-specific read-modify-write atomics are sources of divergence.
16023 return true;
16024 default:
16025 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16026 // Generic read-modify-write atomics are sources of divergence.
16027 return A->readMem() && A->writeMem();
16028 }
16029 return false;
16030 }
16031}
16032
16033 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16034 EVT VT) const {
16035 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16036 case MVT::f32:
16037 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
16038 case MVT::f64:
16039 case MVT::f16:
16040 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
16041 default:
16042 return false;
16043 }
16044}
16045
16046 bool SITargetLowering::denormalsEnabledForType(
16047 LLT Ty, const MachineFunction &MF) const {
16048 switch (Ty.getScalarSizeInBits()) {
16049 case 32:
16050 return !denormalModeIsFlushAllF32(MF);
16051 case 64:
16052 case 16:
16053 return !denormalModeIsFlushAllF64F16(MF);
16054 default:
16055 return false;
16056 }
16057}
16058
16059 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16060 const SelectionDAG &DAG,
16061 bool SNaN,
16062 unsigned Depth) const {
16063 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16064 const MachineFunction &MF = DAG.getMachineFunction();
16065 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16066
16067 if (Info->getMode().DX10Clamp)
16068 return true; // Clamped to 0.
16069 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16070 }
16071
16072 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
16073 SNaN, Depth);
16074}
16075
16076// On older subtargets, global FP atomic instructions have a hardcoded FP mode
16077// and do not support FP32 denormals, and only support v2f16/f64 denormals.
16079 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16080 return true;
16081
16082 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16083 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16084 if (DenormMode == DenormalMode::getPreserveSign())
16085 return true;
16086
16087 // TODO: Remove this.
16088 return RMW->getFunction()
16089 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16090 .getValueAsBool();
16091}
16092
16093 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16094 LLVMContext &Ctx = RMW->getContext();
16095 SmallVector<StringRef> SSNs;
16096 Ctx.getSyncScopeNames(SSNs);
16097 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16098 ? "system"
16099 : SSNs[RMW->getSyncScopeID()];
16100
16101 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16102 << "Hardware instruction generated for atomic "
16103 << RMW->getOperationName(RMW->getOperation())
16104 << " operation at memory scope " << MemScope;
16105}
16106
16107static bool isV2F16OrV2BF16(Type *Ty) {
16108 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16109 Type *EltTy = VT->getElementType();
16110 return VT->getNumElements() == 2 &&
16111 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16112 }
16113
16114 return false;
16115}
16116
16117static bool isV2F16(Type *Ty) {
16118 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16119 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16120}
16121
16122static bool isV2BF16(Type *Ty) {
16123 FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
16124 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16125}
16126
16127/// \return true if atomicrmw integer ops work for the type.
16128static bool isAtomicRMWLegalIntTy(Type *Ty) {
16129 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16130 unsigned BW = IT->getBitWidth();
16131 return BW == 32 || BW == 64;
16132 }
16133
16134 return false;
16135}
16136
16137/// \return true if this atomicrmw xchg type can be selected.
16138static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
16139 Type *Ty = RMW->getType();
16140 if (isAtomicRMWLegalIntTy(Ty))
16141 return true;
16142
16143 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
16144 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
16145 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
16146 return BW == 32 || BW == 64;
16147 }
16148
16149 if (Ty->isFloatTy() || Ty->isDoubleTy())
16150 return true;
16151
16152 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
16153 return VT->getNumElements() == 2 &&
16154 VT->getElementType()->getPrimitiveSizeInBits() == 16;
16155 }
16156
16157 return false;
16158}
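// For example, an xchg of i32, i64, float, double, or a pointer is selectable
// directly, and so is a two-element vector of 16-bit elements such as
// <2 x half>, since it occupies a full 32-bit register; other types fall back
// to the generic handling in the caller.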
16159
16160/// \returns true if it's valid to emit a native instruction for \p RMW, based
16161/// on the properties of the target memory.
16162static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
16163 const AtomicRMWInst *RMW,
16164 bool HasSystemScope) {
16165 // The remote/fine-grained access logic is different from the integer
16166 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
16167 // fine-grained access does not work, even for a device local allocation.
16168 //
16169 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
16170 // allocations work.
16171 if (HasSystemScope) {
16173 RMW->hasMetadata("amdgpu.no.remote.memory"))
16174 return true;
16176 return true;
16177
16178 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
16179}
16180
16181/// \return Action to perform on AtomicRMWInsts for integer operations.
16184 return isAtomicRMWLegalIntTy(RMW->getType())
16187}
16188
16189 TargetLowering::AtomicExpansionKind
16190 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16191 unsigned AS = RMW->getPointerAddressSpace();
16192 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16193 return AtomicExpansionKind::NotAtomic;
16194
16195 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16196 OptimizationRemarkEmitter ORE(RMW->getFunction());
16197 ORE.emit([=]() {
16198 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16199 });
16200 return Kind;
16201 };
16202
16203 auto SSID = RMW->getSyncScopeID();
16204 bool HasSystemScope =
16205 SSID == SyncScope::System ||
16206 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16207
16208 auto Op = RMW->getOperation();
16209 switch (Op) {
16210 case AtomicRMWInst::Xchg: {
16211 // PCIe supports add and xchg for system atomics.
16212 return isAtomicRMWLegalXChgTy(RMW)
16215
16216 // PCIe supports add and xchg for system atomics.
16218 }
16219 case AtomicRMWInst::Add:
16220 case AtomicRMWInst::And:
16224 case AtomicRMWInst::Sub:
16225 case AtomicRMWInst::Or:
16226 case AtomicRMWInst::Xor: {
16227 // Atomic sub/or/xor do not work over PCI express, but atomic add
16228 // does. InstCombine transforms these with 0 to or, so undo that.
16229 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16230 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16231 ConstVal && ConstVal->isNullValue())
16232 return AtomicExpansionKind::Expand;
16233 }
16234
16236 }
16237 case AtomicRMWInst::FAdd: {
16238 Type *Ty = RMW->getType();
16239
16240 // TODO: Handle REGION_ADDRESS
16241 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16242 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16243 // is fixed to round-to-nearest-even.
16244 //
16245 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16246 // round-to-nearest-even.
16247 //
16248 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16249 // suggests it is OK if the floating-point mode may not match the calling
16250 // thread.
16251 if (Ty->isFloatTy()) {
16254 }
16255
16256 if (Ty->isDoubleTy()) {
16257 // Ignores denormal mode, but we don't consider flushing mandatory.
16260 }
16261
16262 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16264
16266 }
16267
16268 // LDS atomics respect the denormal mode from the mode register.
16269 //
16270 // Traditionally f32 global/buffer memory atomics would unconditionally
16271 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16272 // flush.
16273 //
16274 // On targets with flat atomic fadd, denormals would flush depending on
16275 // whether the target address resides in LDS or global memory. We consider
16276 // this flat-maybe-flush as will-flush.
16277 if (Ty->isFloatTy() &&
16281
16282 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16283 // safe. The message phrasing also should be better.
16284 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16285 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16286 // gfx940, gfx12
16287 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
16288 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16289 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16290 // gfx90a, gfx940, gfx12
16291 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16292 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16293
16294 // gfx940, gfx12
16295 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
16296 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16297 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16298 // gfx90a, gfx940, gfx12
16299 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
16300 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16301
16302 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16303 // buffer. gfx12 does have the buffer version.
16304 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
16305 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16306 }
16307
16308 // global and flat atomic fadd f64: gfx90a, gfx940.
16309 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16310 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16311
16312 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16313 if (Ty->isFloatTy()) {
16314 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16315 // gfx11+.
16316 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16317 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16318 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16319 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16320 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16321 } else {
16322 // gfx908
16323 if (RMW->use_empty() &&
16325 isV2F16(Ty))
16326 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16327 }
16328 }
16329
16330 // flat atomic fadd f32: gfx940, gfx11+.
16331 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16332 if (Subtarget->hasFlatAtomicFaddF32Inst())
16333 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16334
16335 // If it is in flat address space, and the type is float, we will try to
16336 // expand it, if the target supports global and lds atomic fadd. The
16337 // reason we need that is, in the expansion, we emit the check of
16338 // address space. If it is in global address space, we emit the global
16339 // atomic fadd; if it is in shared address space, we emit the LDS atomic
16340 // fadd.
16341 if (Subtarget->hasLDSFPAtomicAddF32()) {
16342 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16344 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16346 }
16347 }
16348 }
16349
16351 }
16353 case AtomicRMWInst::FMax: {
16354 Type *Ty = RMW->getType();
16355
16356 // LDS float and double fmin/fmax were always supported.
16357 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16358 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
16360 }
16361
16362 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16363 // For flat and global cases:
16364 // float, double in gfx7. Manual claims denormal support.
16365 // Removed in gfx8.
16366 // float, double restored in gfx10.
16367 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16368 //
16369 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
16370 // no f32.
16371 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16372 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16373 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16374 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16375 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16376 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16378 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16379 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16380 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16381 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16382 }
16383 }
16384
16386 }
16387 case AtomicRMWInst::Min:
16388 case AtomicRMWInst::Max:
16390 case AtomicRMWInst::UMax: {
16393 // Always expand system scope min/max atomics.
16394 if (HasSystemScope)
16396 }
16397
16399 }
16402 default:
16404 }
16405
16406 llvm_unreachable("covered atomicrmw op switch");
16407}
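// Illustrative IR-level example: an
//   atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent") monotonic
// tagged with !amdgpu.no.fine.grained.memory can usually select a hardware
// atomic on subtargets that provide a global fadd instruction, while the same
// operation at system scope without the metadata is expanded instead.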
16408
16414}
16415
16416 TargetLowering::AtomicExpansionKind
16417 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16418 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16419 ? AtomicExpansionKind::NotAtomic
16420 : AtomicExpansionKind::None;
16421 }
16422
16428}
16429
16430const TargetRegisterClass *
16431SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16432 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
16433 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16434 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16435 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16436 : &AMDGPU::SReg_32RegClass;
16437 if (!TRI->isSGPRClass(RC) && !isDivergent)
16438 return TRI->getEquivalentSGPRClass(RC);
16439 if (TRI->isSGPRClass(RC) && isDivergent)
16440 return TRI->getEquivalentVGPRClass(RC);
16441
16442 return RC;
16443}
16444
16445// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16446// uniform values (as produced by the mask results of control flow intrinsics)
16447// used outside of divergent blocks. The phi users need to also be treated as
16448// always uniform.
16449//
16450// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16451static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16452 unsigned WaveSize) {
16453 // FIXME: We assume we never cast the mask results of a control flow
16454 // intrinsic.
16455 // Early exit if the type won't be consistent as a compile time hack.
16456 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16457 if (!IT || IT->getBitWidth() != WaveSize)
16458 return false;
16459
16460 if (!isa<Instruction>(V))
16461 return false;
16462 if (!Visited.insert(V).second)
16463 return false;
16464 bool Result = false;
16465 for (const auto *U : V->users()) {
16466 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16467 if (V == U->getOperand(1)) {
16468 switch (Intrinsic->getIntrinsicID()) {
16469 default:
16470 Result = false;
16471 break;
16472 case Intrinsic::amdgcn_if_break:
16473 case Intrinsic::amdgcn_if:
16474 case Intrinsic::amdgcn_else:
16475 Result = true;
16476 break;
16477 }
16478 }
16479 if (V == U->getOperand(0)) {
16480 switch (Intrinsic->getIntrinsicID()) {
16481 default:
16482 Result = false;
16483 break;
16484 case Intrinsic::amdgcn_end_cf:
16485 case Intrinsic::amdgcn_loop:
16486 Result = true;
16487 break;
16488 }
16489 }
16490 } else {
16491 Result = hasCFUser(U, Visited, WaveSize);
16492 }
16493 if (Result)
16494 break;
16495 }
16496 return Result;
16497}
16498
16499 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16500 const Value *V) const {
16501 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16502 if (CI->isInlineAsm()) {
16503 // FIXME: This cannot give a correct answer. This should only trigger in
16504 // the case where inline asm returns mixed SGPR and VGPR results, used
16505 // outside the defining block. We don't have a specific result to
16506 // consider, so this assumes if any value is SGPR, the overall register
16507 // also needs to be SGPR.
16508 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16509 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16510 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16511 for (auto &TC : TargetConstraints) {
16512 if (TC.Type == InlineAsm::isOutput) {
16513 ComputeConstraintToUse(TC, SDValue());
16514 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
16515 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16516 if (RC && SIRI->isSGPRClass(RC))
16517 return true;
16518 }
16519 }
16520 }
16521 }
16522 SmallPtrSet<const Value *, 16> Visited;
16523 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16524}
16525
16527 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16528 for (; I != E; ++I) {
16529 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16530 if (getBasePtrIndex(M) == I.getOperandNo())
16531 return true;
16532 }
16533 }
16534 return false;
16535}
16536
16537 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16538 SDValue N1) const {
16539 if (!N0.hasOneUse())
16540 return false;
16541 // Take care of the opportunity to keep N0 uniform
16542 if (N0->isDivergent() || !N1->isDivergent())
16543 return true;
16544 // Check if we have a good chance to form the memory access pattern with the
16545 // base and offset
16546 return (DAG.isBaseWithConstantOffset(N0) &&
16547 hasMemSDNodeUser(*N0->use_begin()));
16548}
16549
16550 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16551 Register N0, Register N1) const {
16552 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16553}
16554
16555 MachineMemOperand::Flags
16556 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16557 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16558 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16559 if (I.getMetadata("amdgpu.noclobber"))
16560 Flags |= MONoClobber;
16561 if (I.getMetadata("amdgpu.last.use"))
16562 Flags |= MOLastUse;
16563 return Flags;
16564}
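// For example, a load that AMDGPUAnnotateUniformValues marked with
// !amdgpu.noclobber metadata will carry the target-specific MONoClobber flag
// on its MachineMemOperand from this point on.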
16565
16566 bool SITargetLowering::checkForPhysRegDependency(
16567 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16568 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16569 if (User->getOpcode() != ISD::CopyToReg)
16570 return false;
16571 if (!Def->isMachineOpcode())
16572 return false;
16573 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16574 if (!MDef)
16575 return false;
16576
16577 unsigned ResNo = User->getOperand(Op).getResNo();
16578 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16579 return false;
16580 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16581 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16582 PhysReg = AMDGPU::SCC;
16583 const TargetRegisterClass *RC =
16584 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16585 Cost = RC->getCopyCost();
16586 return true;
16587 }
16588 return false;
16589}
16590
16591 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16592 AtomicRMWInst::BinOp Op = AI->getOperation();
16593
16594 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16595 Op == AtomicRMWInst::Xor) {
16596 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16597 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16598 "this cannot be replaced with add");
16599 AI->setOperation(AtomicRMWInst::Add);
16600 return;
16601 }
16602
16603 assert(Subtarget->hasAtomicFaddInsts() &&
16604 "target should have atomic fadd instructions");
16605 assert(AI->getType()->isFloatTy() &&
16606 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16607 "generic atomicrmw expansion only supports FP32 operand in flat "
16608 "address space");
16609 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16610
16611 // Given: atomicrmw fadd ptr %addr, float %val ordering
16612 //
16613 // With this expansion we produce the following code:
16614 // [...]
16615 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16616 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16617 //
16618 // atomicrmw.shared:
16619 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16620 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16621 // float %val ordering
16622 // br label %atomicrmw.phi
16623 //
16624 // atomicrmw.check.private:
16625 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16626 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16627 //
16628 // atomicrmw.private:
16629 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16630 // %loaded.private = load float, ptr addrspace(5) %cast.private
16631 // %val.new = fadd float %loaded.private, %val
16632 // store float %val.new, ptr addrspace(5) %cast.private
16633 // br label %atomicrmw.phi
16634 //
16635 // atomicrmw.global:
16636 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16637 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16638 // float %val ordering
16639 // br label %atomicrmw.phi
16640 //
16641 // atomicrmw.phi:
16642 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16643 // [ %loaded.private, %atomicrmw.private ],
16644 // [ %loaded.global, %atomicrmw.global ]
16645 // br label %atomicrmw.end
16646 //
16647 // atomicrmw.end:
16648 // [...]
16649
16650 IRBuilder<> Builder(AI);
16651 LLVMContext &Ctx = Builder.getContext();
16652
16653 // If the return value isn't used, do not introduce a false use in the phi.
16654 bool ReturnValueIsUsed = !AI->use_empty();
16655
16656 BasicBlock *BB = Builder.GetInsertBlock();
16657 Function *F = BB->getParent();
16658 BasicBlock *ExitBB =
16659 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16660 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16661 BasicBlock *CheckPrivateBB =
16662 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16663 BasicBlock *PrivateBB =
16664 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16665 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16666 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16667
16668 Value *Val = AI->getValOperand();
16669 Type *ValTy = Val->getType();
16670 Value *Addr = AI->getPointerOperand();
16671 Align Alignment = AI->getAlign();
16672
16673 std::prev(BB->end())->eraseFromParent();
16674 Builder.SetInsertPoint(BB);
16675 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16676 {Addr}, nullptr, "is.shared");
16677 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16678
16679 Builder.SetInsertPoint(SharedBB);
16680 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16682
16683 Instruction *Clone = AI->clone();
16684 Clone->insertInto(SharedBB, SharedBB->end());
16686 .set(CastToLocal);
16687 Instruction *LoadedShared = Clone;
16688
16689 Builder.CreateBr(PhiBB);
16690
16691 Builder.SetInsertPoint(CheckPrivateBB);
16692 CallInst *IsPrivate = Builder.CreateIntrinsic(
16693 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16694 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16695
16696 Builder.SetInsertPoint(PrivateBB);
16697 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16699 Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16700 Alignment, "loaded.private");
16701
16702 Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16703
16704 Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
16705 Builder.CreateBr(PhiBB);
16706
16707 Builder.SetInsertPoint(GlobalBB);
16708 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16710 Value *LoadedGlobal = AI;
16711
16713
16714 AI->removeFromParent();
16715 AI->insertInto(GlobalBB, GlobalBB->end());
16716
16717 Builder.CreateBr(PhiBB);
16718
16719 Builder.SetInsertPoint(PhiBB);
16720
16721 if (ReturnValueIsUsed) {
16722 PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
16723 AI->replaceAllUsesWith(Loaded);
16724 Loaded->addIncoming(LoadedShared, SharedBB);
16725 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16726 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16727 Loaded->takeName(AI);
16728 }
16729
16730 Builder.CreateBr(ExitBB);
16731}
16732
16733LoadInst *
16735 IRBuilder<> Builder(AI);
16736 auto Order = AI->getOrdering();
16737
16738 // The optimization removes the store aspect of the atomicrmw. Therefore, the
16739 // cache must be flushed if the atomic ordering had release semantics. This is
16740 // not necessarily a fence; a release fence just happens to do that flush.
16741 // Avoid replacing an atomicrmw that has release semantics.
16742 if (isReleaseOrStronger(Order))
16743 return nullptr;
16744
16745 LoadInst *LI = Builder.CreateAlignedLoad(
16746 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16747 LI->setAtomic(Order, AI->getSyncScopeID());
16748 LI->copyMetadata(*AI);
16749 LI->takeName(AI);
16750 AI->replaceAllUsesWith(LI);
16751 AI->eraseFromParent();
16752 return LI;
16753}
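// Standalone sketch (not part of SIISelLowering.cpp): what the rewrite above
// does to an idempotent atomicrmw. A hypothetical function performs
// "atomicrmw or ptr %p, i32 0 monotonic", and the atomicrmw is then replaced
// by an atomic load with the same ordering, sync scope and alignment, in the
// same way lowerIdempotentRMWIntoFencedLoad does.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("idempotent-rmw-sketch", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);
  PointerType *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/0);
  auto *FTy = FunctionType::get(I32, {PtrTy}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "read", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);

  IRBuilder<> Builder(BB);
  // Idempotent RMW: "or %p, 0" leaves memory unchanged but returns the old value.
  AtomicRMWInst *AI = Builder.CreateAtomicRMW(
      AtomicRMWInst::Or, F->getArg(0), ConstantInt::get(I32, 0), Align(4),
      AtomicOrdering::Monotonic);
  Builder.CreateRet(AI);

  // The rewrite is only legal because monotonic is not release or stronger.
  if (!isReleaseOrStronger(AI->getOrdering())) {
    IRBuilder<> RB(AI);
    LoadInst *LI = RB.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
                                        AI->getAlign());
    LI->setAtomic(AI->getOrdering(), AI->getSyncScopeID());
    LI->takeName(AI);
    AI->replaceAllUsesWith(LI);
    AI->eraseFromParent();
  }

  verifyFunction(*F, &errs());
  // The printed IR now contains a "load atomic ... monotonic" in place of the
  // atomicrmw.
  M.print(outs(), nullptr);
  return 0;
}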
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition: Compiler.h:199
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition: SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:1172
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition: SIDefines.h:1169
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:1032
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5337
bool isNegative() const
Definition: APFloat.h:1360
APInt bitcastToAPInt() const
Definition: APFloat.h:1266
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1050
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1010
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:994
bool isInfinity() const
Definition: APFloat.h:1357
Class for arbitrary precision integers.
Definition: APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1596
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:632
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:809
static unsigned getPointerOperandIndex()
Definition: Instructions.h:854
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ And
*p = old & v
Definition: Instructions.h:716
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ FSub
*p = old - v
Definition: Instructions.h:736
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:748
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:752
@ Nand
*p = ~(old & v)
Definition: Instructions.h:718
Value * getPointerOperand()
Definition: Instructions.h:852
void setOperation(BinOp Operation)
Definition: Instructions.h:803
BinOp getOperation() const
Definition: Instructions.h:787
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:843
Value * getValOperand()
Definition: Instructions.h:856
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:829
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:860
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:378
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1551
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
bool isSigned() const
Definition: InstrTypes.h:1007
bool isFPPredicate() const
Definition: InstrTypes.h:864
bool isIntPredicate() const
Definition: InstrTypes.h:865
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:206
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:838
bool isBigEndian() const
Definition: DataLayout.h:196
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:461
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:214
iterator_range< arg_iterator > args()
Definition: Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:769
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:281
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:810
bool hasPrefetch() const
Definition: GCNSubtarget.h:940
bool hasMemoryAtomicFaddF32DenormalSupport() const
Definition: GCNSubtarget.h:883
bool hasD16Images() const
Definition: GCNSubtarget.h:696
bool hasAtomicDsPkAdd16Insts() const
Definition: GCNSubtarget.h:845
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:477
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:468
bool hasAtomicFMinFMaxF64FlatInsts() const
Definition: GCNSubtarget.h:841
bool hasDot7Insts() const
Definition: GCNSubtarget.h:795
bool hasApertureRegs() const
Definition: GCNSubtarget.h:597
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:627
bool hasAtomicFMinFMaxF32FlatInsts() const
Definition: GCNSubtarget.h:837
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:765
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:411
bool hasMAIInsts() const
Definition: GCNSubtarget.h:815
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
Definition: GCNSubtarget.h:890
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:676
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:527
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:585
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:266
bool hasDot1Insts() const
Definition: GCNSubtarget.h:771
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:853
Align getStackAlignment() const
Definition: GCNSubtarget.h:953
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:455
bool enableFlatScratch() const
Definition: GCNSubtarget.h:652
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:623
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:461
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:873
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:741
bool useDS128() const
Definition: GCNSubtarget.h:537
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:457
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:270
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:589
bool hasAtomicFMinFMaxF32GlobalInsts() const
Definition: GCNSubtarget.h:829
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:427
bool hasIntClamp() const
Definition: GCNSubtarget.h:357
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:377
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:601
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:631
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:966
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:730
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:336
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:920
bool hasFFBL() const
Definition: GCNSubtarget.h:415
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:985
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:559
bool hasAtomicFMinFMaxF64GlobalInsts() const
Definition: GCNSubtarget.h:833
bool hasMed3_16() const
Definition: GCNSubtarget.h:423
bool hasMovrel() const
Definition: GCNSubtarget.h:989
bool hasAtomicFlatPkAdd16Insts() const
Definition: GCNSubtarget.h:847
bool hasBFI() const
Definition: GCNSubtarget.h:403
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:577
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:344
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:522
bool hasFFBH() const
Definition: GCNSubtarget.h:419
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:849
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
Definition: GCNSubtarget.h:857
bool hasAtomicBufferPkAddBF16Inst() const
Definition: GCNSubtarget.h:869
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:855
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
Definition: GCNSubtarget.h:877
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:547
bool hasDot8Insts() const
Definition: GCNSubtarget.h:799
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:542
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:531
Generation getGeneration() const
Definition: GCNSubtarget.h:317
bool hasAtomicBufferGlobalPkAddF16Insts() const
Definition: GCNSubtarget.h:861
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:728
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:732
bool hasAtomicGlobalPkAddBF16Inst() const
Definition: GCNSubtarget.h:865
bool hasAddr64() const
Definition: GCNSubtarget.h:381
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:431
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:724
bool hasFractBug() const
Definition: GCNSubtarget.h:395
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:399
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:711
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1824
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2417
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1137
LLVMContext & getContext() const
Definition: IRBuilder.h:173
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1131
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1843
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2686
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:78
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:363
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:174
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:259
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:239
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1069
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:231
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:291
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1852
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a zero-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns whether it is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:226
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:736
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:968
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:567
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
Definition: SelectionDAG.h:483
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
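A minimal sketch of how this helper is typically used, assuming this file's existing includes and using namespace llvm; the helper name and the choice of MVT::i1 as the result type are illustrative only.
static SDValue emitIsEqual(SelectionDAG &DAG, const SDLoc &DL,
                           SDValue LHS, SDValue RHS) {
  // Builds (setcc LHS, RHS, seteq) without constructing the condition-code
  // operand by hand; MVT::i1 is used here purely for illustration.
  return DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETEQ);
}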
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
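A minimal sketch of the equivalence described above, assuming this file's existing includes and using namespace llvm; emitNot is a hypothetical helper name.
static SDValue emitNot(SelectionDAG &DAG, const SDLoc &DL,
                       SDValue Val, EVT VT) {
  // Equivalent to DAG.getNode(ISD::XOR, DL, VT, Val,
  //                           DAG.getAllOnesConstant(DL, VT)).
  return DAG.getNOT(DL, Val, VT);
}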
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:493
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:842
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:487
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:488
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:787
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:690
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:482
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:813
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:859
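A minimal sketch combining getConstant with this splat helper, assuming this file's existing includes and using namespace llvm; the v4i32 type and the helper name are illustrative.
static SDValue makeSplatV4I32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Imm) {
  // Splat a 32-bit immediate across all four lanes of a BUILD_VECTOR.
  SDValue Scalar = DAG.getConstant(Imm, DL, MVT::i32);
  return DAG.getSplatBuildVector(MVT::v4i32, DL, Scalar);
}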
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
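A minimal sketch of offsetting a base pointer by a fixed byte count, assuming this file's existing includes and using namespace llvm; the 4-byte offset and the helper name are illustrative.
static SDValue addrPlusFour(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) {
  // Adds a fixed 4-byte offset to Ptr using the flags appropriate for
  // addressing within an object.
  return DAG.getObjectPtrOffset(DL, Ptr, TypeSize::getFixed(4));
}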
LLVMContext * getContext() const
Definition: SelectionDAG.h:500
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:576
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:570
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:502
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
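A minimal, self-contained sketch of the push_back/append/resize operations listed above; the element type and values are illustrative.
#include "llvm/ADT/SmallVector.h"
#include <iterator>

static llvm::SmallVector<int, 4> smallVectorDemo() {
  llvm::SmallVector<int, 4> V;
  V.push_back(1);                              // grow by one element
  const int More[] = {2, 3};
  V.append(std::begin(More), std::end(More));  // append a range
  V.resize(5);                                 // grow to five elements
  return V;
}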
An instruction for storing to memory.
Definition: Instructions.h:290
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:838
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:250
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:262
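A minimal, self-contained sketch of the starts_with/ends_with predicates listed above; the intrinsic-style prefix is only an example string.
#include "llvm/ADT/StringRef.h"

static bool looksLikeAmdgcnName(llvm::StringRef Name) {
  // True for names such as "llvm.amdgcn.workitem.id.x".
  return Name.starts_with("llvm.amdgcn.") && !Name.ends_with(".");
}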
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
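A minimal, self-contained sketch of the Case/Default chain described above; the strings and return values are illustrative.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

static unsigned classifyRegisterKind(llvm::StringRef Kind) {
  // Each Case is tried in order; Default supplies the fallback value.
  return llvm::StringSwitch<unsigned>(Kind)
      .Case("sgpr", 1)
      .Case("vgpr", 2)
      .Case("agpr", 3)
      .Default(0);
}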
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:145
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:298
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:242
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
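A minimal, self-contained sketch combining a few of the Type predicates above; the helper name and the 16-bit-float policy it encodes are illustrative.
#include "llvm/IR/Type.h"

static bool isSixteenBitFloat(const llvm::Type *Ty) {
  // Look through vector types and test the scalar element type.
  const llvm::Type *Scalar = Ty->getScalarType();
  return Scalar->isHalfTy() || Scalar->isBFloatTy();
}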
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void set(Value *Val)
Definition: Value.h:882
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr bool isZero() const
Definition: TypeSize.h:156
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:458
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:465
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:274
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1190
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:752
@ ATOMIC_LOAD_FMAX
Definition: ISDOpcodes.h:1344
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1066
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1337
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1339
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1309
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1340
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1099
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1322
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:840
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:963
@ FPTRUNC_ROUND
FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
Definition: ISDOpcodes.h:494
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1335
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:953
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1336
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:996
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1480
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1342
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:935
@ CONVERGENCECTRL_GLUE
Definition: ISDOpcodes.h:1466
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:634
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
Definition: ISDOpcodes.h:1256
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1115
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:751
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1289
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1056
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:980
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1145
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1338
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:521
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1305
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_FMIN
Definition: ISDOpcodes.h:1345
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition: ISDOpcodes.h:930
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1084
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1061
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1333
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1041
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1279
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:771
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1316
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1341
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1028
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1109
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:848
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:938
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1165
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:972
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1347
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1331
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:479
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1047
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1332
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:886
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1250
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1276
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1330
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:1001
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:919
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:957
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1162
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1138
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:793
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1346
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1603
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1583
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1096
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1589
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double inv_pi
Definition: MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
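A minimal sketch of a common combine-time check built from the two helpers above, assuming this file's existing includes and using namespace llvm; the helper name is illustrative.
static bool isZeroThroughBitcasts(SDValue V) {
  // Strip any chain of bitcasts, then test for a constant integer zero.
  return isNullConstant(peekThroughBitcasts(V));
}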
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2098
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:555
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
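A minimal usage sketch (hypothetical wrapper name, assuming llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
#include <cassert>
// Sketch: round up to the nearest power of two.
static void powerOf2CeilSketch() {
  assert(llvm::PowerOf2Ceil(33) == 64);
  assert(llvm::PowerOf2Ceil(64) == 64); // already a power of two
}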
int countr_zero(T Val)
Count the number of zero bits from the least significant bit upward, stopping at the first set bit.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
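A minimal sketch combining the two bit utilities above (hypothetical wrapper name, assuming llvm/ADT/bit.h and llvm/Support/MathExtras.h):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
// Sketch: trailing-zero count and shifted-mask detection.
static void maskSketch() {
  assert(llvm::countr_zero(0x8u) == 3);           // 0b1000 has three trailing zeros
  assert(llvm::isShiftedMask_64(0x00FF0000ull));  // one contiguous run of ones
  assert(!llvm::isShiftedMask_64(0x00FF00FFull)); // two separate runs
}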
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
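A minimal usage sketch (hypothetical function name, assuming llvm/ADT/STLExtras.h and ArrayRef.h):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
// Sketch: range-based any_of, no explicit begin()/end() pair.
static bool hasNegative(llvm::ArrayRef<int> Vals) {
  return llvm::any_of(Vals, [](int V) { return V < 0; });
}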
unsigned Log2_32(uint32_t Value)
Return the floor of the base-2 logarithm of the specified value, or -1 if the value is zero.
Definition: MathExtras.h:340
int countl_zero(T Val)
Count the number of zero bits from the most significant bit downward, stopping at the first set bit.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
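A minimal sketch of the leading-zero, floor-log2, and power-of-two helpers above (hypothetical wrapper name, assuming llvm/ADT/bit.h and llvm/Support/MathExtras.h):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
// Sketch: leading zeros, floor(log2), and power-of-two checks on 32-bit values.
static void log2Sketch() {
  assert(llvm::countl_zero(1u) == 31); // only bit 0 of a 32-bit value is set
  assert(llvm::Log2_32(32) == 5);      // floor(log2(32))
  assert(llvm::isPowerOf2_32(64));
  assert(!llvm::isPowerOf2_32(0));     // zero is not a power of two
}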
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
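An illustrative sketch of the floating-point predicate mapping (hypothetical wrapper name, assuming the declaration in llvm/CodeGen/Analysis.h):
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Instructions.h"
#include <cassert>
// Sketch: ordered/unordered IR predicates map onto the matching ISD codes.
static void fcmpCondCodeSketch() {
  assert(llvm::getFCmpCondCode(llvm::FCmpInst::FCMP_OEQ) == llvm::ISD::SETOEQ);
  assert(llvm::getFCmpCondCode(llvm::FCmpInst::FCMP_ULT) == llvm::ISD::SETULT);
}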
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
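A minimal sketch of splitting a 64-bit value with Hi_32 and Lo_32, assuming llvm/Support/MathExtras.h:
#include "llvm/Support/MathExtras.h"
// Sketch: the two 32-bit halves of a 64-bit constant.
static_assert(llvm::Hi_32(0x1122334455667788ull) == 0x11223344u, "upper half");
static_assert(llvm::Lo_32(0x1122334455667788ull) == 0x55667788u, "lower half");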
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
Definition: LowerAtomic.cpp:42
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
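A minimal sketch, assuming llvm/Support/MathExtras.h:
#include "llvm/Support/MathExtras.h"
// Sketch: integer division rounded towards positive infinity.
static_assert(llvm::divideCeil(10u, 4u) == 3u, "ceil(10 / 4)");
static_assert(llvm::divideCeil(8u, 4u) == 2u, "exact division is unchanged");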
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A that is large enough to hold Size bytes.
Definition: Alignment.h:155
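A minimal usage sketch (hypothetical wrapper name, assuming llvm/Support/Alignment.h):
#include "llvm/Support/Alignment.h"
#include <cassert>
// Sketch: round a byte count up to the next multiple of an Align.
static void alignToSketch() {
  assert(llvm::alignTo(10, llvm::Align(8)) == 16);
  assert(llvm::alignTo(16, llvm::Align(8)) == 16); // already aligned
}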
DWARFExpression::Operation Op
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:235
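A minimal sketch of the signed N-bit range helpers minIntN and maxIntN (hypothetical wrapper name, assuming llvm/Support/MathExtras.h):
#include "llvm/Support/MathExtras.h"
#include <cassert>
// Sketch: the representable range of an 8-bit signed integer.
static void intRangeSketch() {
  assert(llvm::maxIntN(8) == 127);
  assert(llvm::minIntN(8) == -128);
}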
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1886
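A minimal sketch combining find_if and is_contained (hypothetical function name, assuming llvm/ADT/STLExtras.h and ArrayRef.h):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
// Sketch: membership test plus a predicate search over a range.
static bool containsNeedleOrEven(llvm::ArrayRef<int> Vals, int Needle) {
  if (llvm::is_contained(Vals, Needle))
    return true;
  return llvm::find_if(Vals, [](int V) { return V % 2 == 0; }) != Vals.end();
}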
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment known to hold at Offset bytes from an address with alignment A (same semantics as MinAlign).
Definition: Alignment.h:212
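A minimal usage sketch (hypothetical wrapper name, assuming llvm/Support/Alignment.h):
#include "llvm/Support/Alignment.h"
#include <cassert>
// Sketch: the alignment still guaranteed after stepping Offset bytes from an aligned base.
static void commonAlignmentSketch() {
  assert(llvm::commonAlignment(llvm::Align(16), 8) == llvm::Align(8));
  assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));
}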
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:254
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:279
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:381
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:291
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:359
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:234
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:371
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:456
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:398
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:307
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
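An illustrative sketch of the EVT factory functions above (hypothetical wrapper name, assuming llvm/CodeGen/ValueTypes.h):
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
// Sketch: build an i32 EVT and a 4-element vector of it.
static void evtSketch(llvm::LLVMContext &Ctx) {
  llvm::EVT I32 = llvm::EVT::getIntegerVT(Ctx, 32);
  llvm::EVT V4I32 = llvm::EVT::getVectorVT(Ctx, I32, 4);
  assert(V4I32.getVectorNumElements() == 4);
  assert(V4I32.getScalarType() == I32);
}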
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:314
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:247
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:204
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:319
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
Definition: ValueTypes.cpp:306
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:327
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute the known bits resulting from the addition of LHS and RHS.
Definition: KnownBits.h:333
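An illustrative sketch of bit propagation through KnownBits::add (hypothetical wrapper name, assuming llvm/Support/KnownBits.h):
#include "llvm/Support/KnownBits.h"
#include <cassert>
// Sketch: adding two values whose low bit is known clear keeps the low bit clear.
static void knownBitsAddSketch() {
  llvm::KnownBits A(8), B(8);
  A.Zero.setBit(0); // A is known to be even
  B.Zero.setBit(0); // B is known to be even
  llvm::KnownBits Sum = llvm::KnownBits::add(A, B);
  assert(Sum.Zero[0]); // even + even is even
}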
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals