1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
53static cl::opt<bool> DisableLoopAlignment(
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
58static cl::opt<bool> UseDivergentRegisterIndexing(
59 "amdgpu-use-divergent-register-indexing",
60 cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI),
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations on these types are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
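 // For example, with ZeroOrOneBooleanContent, a (zext i1 %c to i32) can simply
 // become a select between the constants 0 and 1.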
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
228 // TODO: Could make these legal
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
237
239 AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
240 }
241
242 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
243 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
244 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
245 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
246 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
247 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
248 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
249 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
250 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
251 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
252 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
253 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
254 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
255 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
256 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
257 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
258
259 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
260 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
261 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
263 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
264 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
265 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
266
267 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
268
272 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
273
274 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
275
277 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
278
280 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
281 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
282
284 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
285 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
286 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
287 Expand);
289 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
290 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
291 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
292 Expand);
293
295 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
296 MVT::v3i16, MVT::v4i16, MVT::Other},
297 Custom);
298
301 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
302
304
306
308 Expand);
309
310#if 0
312#endif
313
314 // We only support LOAD/STORE and vector manipulation ops for vectors
315 // with > 4 elements.
316 for (MVT VT :
317 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
318 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
319 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
320 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
321 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
322 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
323 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
324 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
325 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
326 switch (Op) {
327 case ISD::LOAD:
328 case ISD::STORE:
330 case ISD::BITCAST:
331 case ISD::UNDEF:
335 case ISD::IS_FPCLASS:
336 break;
341 break;
342 default:
344 break;
345 }
346 }
347 }
348
350
351 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
352 // is expanded to avoid having two separate loops in case the index is a VGPR.
353
354 // Most operations are naturally 32-bit vector operations. We only support
355 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
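 // Illustration: after this promotion, a v2i64 BUILD_VECTOR is effectively
 // handled as a v4i32 BUILD_VECTOR of the 32-bit halves, with bitcasts
 // inserted around it by the promotion machinery.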
356 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
358 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
359
361 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
362
364 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
368 }
369
370 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
373
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
376
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
382 }
383
384 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
387
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
390
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
396 }
397
398 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
401
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
404
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
410 }
411
412 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
415
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
418
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
424 }
425
427 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
428 Expand);
429
430 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
431 Custom);
432
433 // Avoid stack access for these.
434 // TODO: Generalize to more vector types.
436 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
437 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
438 Custom);
439
440 // Deal with vec3 vector operations when widened to vec4.
442 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
443
444 // Deal with vec5/6/7 vector operations when widened to vec8.
446 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Custom);
451
452 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
453 // and output demarshalling
454 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
455
456 // We can't return success/failure, only the old value;
457 // let LLVM add the comparison.
459 Expand);
460
461 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
462
463 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
464
465 // FIXME: This should be narrowed to i32, but that only happens if i64 is
466 // illegal.
467 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
468 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
469
470 // On SI this is s_memtime; on VI it is s_memrealtime.
472
473 if (Subtarget->hasSMemRealTime() ||
477
478 if (Subtarget->has16BitInsts()) {
481 } else {
483 }
484
485 if (Subtarget->hasMadMacF32Insts())
487
488 if (!Subtarget->hasBFI())
489 // fcopysign can be done in a single instruction with BFI.
490 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
491
492 if (!Subtarget->hasBCNT(32))
494
495 if (!Subtarget->hasBCNT(64))
497
498 if (Subtarget->hasFFBH())
500
501 if (Subtarget->hasFFBL())
503
504 // We only really have 32-bit BFE instructions (and 16-bit on VI).
505 //
506 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
507 // effort to match them now. We want this to be false for i64 cases when the
508 // extraction isn't restricted to the upper or lower half. Ideally we would
509 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
510 // span the midpoint are probably relatively rare, so don't worry about them
511 // for now.
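 // For example, extracting bits [24, 40) of an i64 spans the 32-bit midpoint
 // and has no single 32-bit BFE equivalent, while an extract contained in
 // either half maps directly onto a 32-bit BFE of that half.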
512 if (Subtarget->hasBFE())
514
515 // Clamp modifier on add/sub
516 if (Subtarget->hasIntClamp())
518
519 if (Subtarget->hasAddNoCarry())
520 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
521 Legal);
522
523 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
524 Custom);
525
526 // These are really only legal for ieee_mode functions. We should be avoiding
527 // them for functions that don't have ieee_mode enabled, so just say they are
528 // legal.
530 {MVT::f32, MVT::f64}, Legal);
531
532 if (Subtarget->haveRoundOpsF64())
534 Legal);
535 else
537 MVT::f64, Custom);
538
540 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
541 Legal);
542 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
543
546
547 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
548 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
549
550 // Custom lower these because we can't specify a rule based on an illegal
551 // source bf16.
554
555 if (Subtarget->has16BitInsts()) {
558 MVT::i16, Legal);
559
560 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
561
563 MVT::i16, Expand);
564
568 ISD::CTPOP},
569 MVT::i16, Promote);
570
572
573 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
574
576 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
578 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
579
583
585
586 // F16 - Constant Actions.
589
590 // F16 - Load/Store Actions.
592 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
594 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
595
596 // BF16 - Load/Store Actions.
598 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
600 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
601
602 // F16 - VOP1 Actions.
605 MVT::f16, Custom);
606
609
610 // F16 - VOP2 Actions.
611 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
612 Expand);
616
617 // F16 - VOP3 Actions.
619 if (STI.hasMadF16())
621
622 for (MVT VT :
623 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
624 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
625 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
626 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
627 switch (Op) {
628 case ISD::LOAD:
629 case ISD::STORE:
631 case ISD::BITCAST:
632 case ISD::UNDEF:
638 case ISD::IS_FPCLASS:
639 break;
642 break;
643 default:
645 break;
646 }
647 }
648 }
649
650 // v_perm_b32 can handle either of these.
651 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
653
654 // XXX - Do these do anything? Vector constants turn into build_vector.
655 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
656
657 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
658 Legal);
659
661 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
663 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
664
666 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
668 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
669
670 setOperationAction(ISD::AND, MVT::v2i16, Promote);
671 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
672 setOperationAction(ISD::OR, MVT::v2i16, Promote);
673 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
674 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
675 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
676
678 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
680 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
681 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
682 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
683
685 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
687 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
689 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
690
692 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
694 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
695 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
696 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
697
699 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
701 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
702
704 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
706 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
708 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
709
710 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
711 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
712 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
713 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
714 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
716
718 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
720 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
721 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
722 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
723
724 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
725 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
726 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
727 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
728 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
730
732 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
734 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
735 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
736 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
737
739 MVT::v2i32, Expand);
741
743 MVT::v4i32, Expand);
744
746 MVT::v8i32, Expand);
747
748 if (!Subtarget->hasVOP3PInsts())
750 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
751
752 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
753 // This isn't really legal, but this avoids the legalizer unrolling it (and
754 // allows matching fneg (fabs x) patterns)
755 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
756
759
761 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
762 Custom);
763
765 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
766 Expand);
767
768 for (MVT Vec16 :
769 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
770 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
773 Vec16, Custom);
775 }
776 }
777
778 if (Subtarget->hasVOP3PInsts()) {
782 MVT::v2i16, Legal);
783
786 MVT::v2f16, Legal);
787
788 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
789 Custom);
790
792 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
793 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
794 Custom);
795
796 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
797 // Split vector operations.
802 VT, Custom);
803
804 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
805 // Split vector operations.
807 VT, Custom);
808
809 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
810 Custom);
811
812 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
813 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
814 Custom);
815
816 if (Subtarget->hasPackedFP32Ops()) {
818 MVT::v2f32, Legal);
820 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
821 Custom);
822 }
823 }
824
826
827 if (Subtarget->has16BitInsts()) {
829 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
831 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
832 } else {
833 // Legalization hack.
834 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
835
837 }
838
840 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
841 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
842 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
843 MVT::v32f16, MVT::v32bf16},
844 Custom);
845
847
848 if (Subtarget->hasScalarSMulU64())
850
851 if (Subtarget->hasMad64_32())
853
854 if (Subtarget->hasPrefetch())
856
857 if (Subtarget->hasIEEEMinMax())
859 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
860
862 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
863 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
864 Custom);
865
867 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
868 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
869 MVT::i16, MVT::i8, MVT::i128},
870 Custom);
871
873 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
874 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
875 MVT::i8, MVT::i128},
876 Custom);
877
882
883 // TODO: Could move this to custom lowering, could benefit from combines on
884 // extract of relevant bits.
886
888
891 ISD::SUB,
893 ISD::FADD,
894 ISD::FSUB,
895 ISD::FDIV,
902 ISD::FMA,
903 ISD::SMIN,
904 ISD::SMAX,
905 ISD::UMIN,
906 ISD::UMAX,
908 ISD::AND,
909 ISD::OR,
910 ISD::XOR,
911 ISD::FSHR,
921
922 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
924
925 // All memory operations. Some folding on the pointer operand is done to help
926 // matching the constant offsets in the addressing modes.
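 // For example, combining on the pointer operand lets (load (add %ptr, 16))
 // fold the constant 16 into the memory instruction's immediate offset field
 // instead of materializing the add separately.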
949
950 // FIXME: In other contexts we pretend this is a per-function property.
952
954}
955
956const GCNSubtarget *SITargetLowering::getSubtarget() const {
957 return Subtarget;
958}
959
960//===----------------------------------------------------------------------===//
961// TargetLowering queries
962//===----------------------------------------------------------------------===//
963
964// v_mad_mix* support a conversion from f16 to f32.
965//
966 // There is only one special case, when denormals are enabled, that we don't
967 // currently handle, where this is OK to use.
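// For example, (fadd f32 (fpext f16 %a), f32 %b) can be matched to a
// mixed-precision mad/fma, folding the f16-to-f32 conversion into the
// instruction's operand modifiers.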
968bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
969 EVT DestVT, EVT SrcVT) const {
970 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
971 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
972 DestVT.getScalarType() == MVT::f32 &&
973 SrcVT.getScalarType() == MVT::f16 &&
974 // TODO: This probably only requires no input flushing?
975 denormalModeIsFlushAllF32(DAG.getMachineFunction());
976}
977
978bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
979 LLT DestTy, LLT SrcTy) const {
980 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
982 DestTy.getScalarSizeInBits() == 32 &&
983 SrcTy.getScalarSizeInBits() == 16 &&
984 // TODO: This probably only requires no input flushing?
985 denormalModeIsFlushAllF32(*MI.getMF());
986}
987
989 // SI has some legal vector types, but no legal vector operations. Say no
990 // shuffles are legal in order to prefer scalarizing some vector operations.
991 return false;
992}
993
996 EVT VT) const {
999
1000 if (VT.isVector()) {
1001 EVT ScalarVT = VT.getScalarType();
1002 unsigned Size = ScalarVT.getSizeInBits();
1003 if (Size == 16) {
1004 if (Subtarget->has16BitInsts()) {
1005 if (VT.isInteger())
1006 return MVT::v2i16;
1007 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1008 }
1009 return VT.isInteger() ? MVT::i32 : MVT::f32;
1010 }
1011
1012 if (Size < 16)
1013 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1014 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1015 }
1016
1017 if (VT.getSizeInBits() > 32)
1018 return MVT::i32;
1019
1021}
1022
1025 EVT VT) const {
1028
1029 if (VT.isVector()) {
1030 unsigned NumElts = VT.getVectorNumElements();
1031 EVT ScalarVT = VT.getScalarType();
1032 unsigned Size = ScalarVT.getSizeInBits();
1033
1034 // FIXME: Should probably promote 8-bit vectors to i16.
1035 if (Size == 16 && Subtarget->has16BitInsts())
1036 return (NumElts + 1) / 2;
1037
1038 if (Size <= 32)
1039 return NumElts;
1040
1041 if (Size > 32)
1042 return NumElts * ((Size + 31) / 32);
1043 } else if (VT.getSizeInBits() > 32)
1044 return (VT.getSizeInBits() + 31) / 32;
1045
1047}
1048
1050 LLVMContext &Context, CallingConv::ID CC,
1051 EVT VT, EVT &IntermediateVT,
1052 unsigned &NumIntermediates, MVT &RegisterVT) const {
1053 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1054 unsigned NumElts = VT.getVectorNumElements();
1055 EVT ScalarVT = VT.getScalarType();
1056 unsigned Size = ScalarVT.getSizeInBits();
1057 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1058 // support, but unless we can properly handle 3-vectors, it will still be
1059 // inconsistent.
1060 if (Size == 16 && Subtarget->has16BitInsts()) {
1061 if (ScalarVT == MVT::bf16) {
1062 RegisterVT = MVT::i32;
1063 IntermediateVT = MVT::v2bf16;
1064 } else {
1065 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1066 IntermediateVT = RegisterVT;
1067 }
1068 NumIntermediates = (NumElts + 1) / 2;
1069 return NumIntermediates;
1070 }
1071
1072 if (Size == 32) {
1073 RegisterVT = ScalarVT.getSimpleVT();
1074 IntermediateVT = RegisterVT;
1075 NumIntermediates = NumElts;
1076 return NumIntermediates;
1077 }
1078
1079 if (Size < 16 && Subtarget->has16BitInsts()) {
1080 // FIXME: Should probably form v2i16 pieces
1081 RegisterVT = MVT::i16;
1082 IntermediateVT = ScalarVT;
1083 NumIntermediates = NumElts;
1084 return NumIntermediates;
1085 }
1086
1087
1088 if (Size != 16 && Size <= 32) {
1089 RegisterVT = MVT::i32;
1090 IntermediateVT = ScalarVT;
1091 NumIntermediates = NumElts;
1092 return NumIntermediates;
1093 }
1094
1095 if (Size > 32) {
1096 RegisterVT = MVT::i32;
1097 IntermediateVT = RegisterVT;
1098 NumIntermediates = NumElts * ((Size + 31) / 32);
1099 return NumIntermediates;
1100 }
1101 }
1102
1104 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1105}
1106
1107static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1108 assert(MaxNumLanes != 0);
1109
1110 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1111 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1112 return EVT::getVectorVT(Ty->getContext(),
1113 EVT::getEVT(VT->getElementType()),
1114 NumElts);
1115 }
1116
1117 return EVT::getEVT(Ty);
1118}
1119
1120// Peek through TFE struct returns to only use the data size.
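// For example, a TFE load returning { <4 x float>, i32 } is treated as a
// <4 x float> memory access (possibly further narrowed by MaxNumLanes).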
1121static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1122 auto *ST = dyn_cast<StructType>(Ty);
1123 if (!ST)
1124 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1125
1126 // TFE intrinsics return an aggregate type.
1127 assert(ST->getNumContainedTypes() == 2 &&
1128 ST->getContainedType(1)->isIntegerTy(32));
1129 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1130}
1131
1132/// Map address space 7 to MVT::v5i32 because that's its in-memory
1133/// representation. This return value is vector-typed because there is no
1134/// MVT::i160 and it is not clear if one can be added. While this could
1135/// cause issues during codegen, these address space 7 pointers will be
1136/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1137/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1138/// modeling, to work.
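/// For reference, a p7 (buffer fat pointer) is a 128-bit buffer resource plus
/// a 32-bit offset, i.e. 160 bits, which is why v5i32 is used as the closest
/// in-register representation.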
1139MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1140 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1141 return MVT::v5i32;
1142 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1143 DL.getPointerSizeInBits(AS) == 192)
1144 return MVT::v6i32;
1145 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1146}
1147/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1148/// v8i32 when padding is added.
1149/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1150/// also v8i32 with padding.
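/// For example, {p8, i32} is 160 bits of data but is padded out to a 256-bit
/// (v8i32) in-memory slot.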
1151MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1152 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1153 DL.getPointerSizeInBits(AS) == 160) ||
1154 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1155 DL.getPointerSizeInBits(AS) == 192))
1156 return MVT::v8i32;
1157 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1158}
1159
1161 const CallInst &CI,
1162 MachineFunction &MF,
1163 unsigned IntrID) const {
1165 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1167
1168 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1171 (Intrinsic::ID)IntrID);
1172 MemoryEffects ME = Attr.getMemoryEffects();
1173 if (ME.doesNotAccessMemory())
1174 return false;
1175
1176 // TODO: Should images get their own address space?
1177 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1178
1179 if (RsrcIntr->IsImage)
1180 Info.align.reset();
1181
1182 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1183 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1184 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1185 // We conservatively set the memory operand of a buffer intrinsic to the
1186 // base resource pointer, so that we can access alias information about
1187 // those pointers. Cases like "this points at the same value
1188 // but with a different offset" are handled in
1189 // areMemAccessesTriviallyDisjoint.
1190 Info.ptrVal = RsrcArg;
1191 }
1192
1193 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1194 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1197 if (ME.onlyReadsMemory()) {
1198 unsigned MaxNumLanes = 4;
1199
1200 if (RsrcIntr->IsImage) {
1203 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1205
1206 if (!BaseOpcode->Gather4) {
1207 // If this isn't a gather, we may have excess loaded elements in the
1208 // IR type. Check the dmask for the real number of elements loaded.
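 // For example, a dmask of 0b0101 loads only two components even if the IR
 // return type is <4 x float>.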
1209 unsigned DMask
1210 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1211 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1212 }
1213 }
1214
1215 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1216
1217 // FIXME: What does alignment mean for an image?
1220 } else if (ME.onlyWritesMemory()) {
1222
1223 Type *DataTy = CI.getArgOperand(0)->getType();
1224 if (RsrcIntr->IsImage) {
1225 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1226 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1227 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1228 } else
1229 Info.memVT = EVT::getEVT(DataTy);
1230
1232 } else {
1233 // Atomic
1234 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1236 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1240
1241 switch (IntrID) {
1242 default:
1243 // XXX - Should this be volatile without known ordering?
1245 break;
1246 case Intrinsic::amdgcn_raw_buffer_load_lds:
1247 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1248 case Intrinsic::amdgcn_struct_buffer_load_lds:
1249 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1250 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1251 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1252 Info.ptrVal = CI.getArgOperand(1);
1253 return true;
1254 }
1255 }
1256 }
1257 return true;
1258 }
1259
1260 switch (IntrID) {
1261 case Intrinsic::amdgcn_ds_ordered_add:
1262 case Intrinsic::amdgcn_ds_ordered_swap:
1263 case Intrinsic::amdgcn_ds_fadd:
1264 case Intrinsic::amdgcn_ds_fmin:
1265 case Intrinsic::amdgcn_ds_fmax: {
1267 Info.memVT = MVT::getVT(CI.getType());
1268 Info.ptrVal = CI.getOperand(0);
1269 Info.align.reset();
1271
1272 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1273 if (!Vol->isZero())
1275
1276 return true;
1277 }
1278 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1280 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1281 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1282 Info.align.reset();
1284
1285 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1286 if (!Vol || !Vol->isZero())
1288
1289 return true;
1290 }
1291 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1292 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1294 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1295 Info.ptrVal = nullptr;
1296 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1298 return true;
1299 }
1300 case Intrinsic::amdgcn_ds_append:
1301 case Intrinsic::amdgcn_ds_consume: {
1303 Info.memVT = MVT::getVT(CI.getType());
1304 Info.ptrVal = CI.getOperand(0);
1305 Info.align.reset();
1307
1308 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1309 if (!Vol->isZero())
1311
1312 return true;
1313 }
1314 case Intrinsic::amdgcn_global_atomic_csub: {
1316 Info.memVT = MVT::getVT(CI.getType());
1317 Info.ptrVal = CI.getOperand(0);
1318 Info.align.reset();
1322 return true;
1323 }
1324 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1326 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1327
1328 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1329 Info.align.reset();
1332 return true;
1333 }
1334 case Intrinsic::amdgcn_global_atomic_fadd:
1335 case Intrinsic::amdgcn_global_atomic_fmin:
1336 case Intrinsic::amdgcn_global_atomic_fmax:
1337 case Intrinsic::amdgcn_global_atomic_fmin_num:
1338 case Intrinsic::amdgcn_global_atomic_fmax_num:
1339 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1340 case Intrinsic::amdgcn_flat_atomic_fadd:
1341 case Intrinsic::amdgcn_flat_atomic_fmin:
1342 case Intrinsic::amdgcn_flat_atomic_fmax:
1343 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1344 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1345 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1346 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1347 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1349 Info.memVT = MVT::getVT(CI.getType());
1350 Info.ptrVal = CI.getOperand(0);
1351 Info.align.reset();
1356 return true;
1357 }
1358 case Intrinsic::amdgcn_global_load_tr_b64:
1359 case Intrinsic::amdgcn_global_load_tr_b128: {
1361 Info.memVT = MVT::getVT(CI.getType());
1362 Info.ptrVal = CI.getOperand(0);
1363 Info.align.reset();
1365 return true;
1366 }
1367 case Intrinsic::amdgcn_ds_gws_init:
1368 case Intrinsic::amdgcn_ds_gws_barrier:
1369 case Intrinsic::amdgcn_ds_gws_sema_v:
1370 case Intrinsic::amdgcn_ds_gws_sema_br:
1371 case Intrinsic::amdgcn_ds_gws_sema_p:
1372 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1374
1375 const GCNTargetMachine &TM =
1376 static_cast<const GCNTargetMachine &>(getTargetMachine());
1377
1379 Info.ptrVal = MFI->getGWSPSV(TM);
1380
1381 // This is an abstract access, but we need to specify a type and size.
1382 Info.memVT = MVT::i32;
1383 Info.size = 4;
1384 Info.align = Align(4);
1385
1386 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1388 else
1390 return true;
1391 }
1392 case Intrinsic::amdgcn_global_load_lds: {
1394 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1395 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1396 Info.ptrVal = CI.getArgOperand(1);
1398 return true;
1399 }
1400 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1402
1403 const GCNTargetMachine &TM =
1404 static_cast<const GCNTargetMachine &>(getTargetMachine());
1405
1407 Info.ptrVal = MFI->getGWSPSV(TM);
1408
1409 // This is an abstract access, but we need to specify a type and size.
1410 Info.memVT = MVT::i32;
1411 Info.size = 4;
1412 Info.align = Align(4);
1413
1415 return true;
1416 }
1417 default:
1418 return false;
1419 }
1420}
1421
1423 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1424 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1425 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1426 // The DAG's ValueType loses the addrspaces.
1427 // Add them as 2 extra Constant operands "from" and "to".
1428 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1429 unsigned DstAS = I.getType()->getPointerAddressSpace();
1430 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1431 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1432 break;
1433 }
1434 default:
1435 break;
1436 }
1437}
1438
1441 Type *&AccessTy) const {
1442 Value *Ptr = nullptr;
1443 switch (II->getIntrinsicID()) {
1444 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1445 case Intrinsic::amdgcn_ds_append:
1446 case Intrinsic::amdgcn_ds_consume:
1447 case Intrinsic::amdgcn_ds_fadd:
1448 case Intrinsic::amdgcn_ds_fmax:
1449 case Intrinsic::amdgcn_ds_fmin:
1450 case Intrinsic::amdgcn_ds_ordered_add:
1451 case Intrinsic::amdgcn_ds_ordered_swap:
1452 case Intrinsic::amdgcn_flat_atomic_fadd:
1453 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1454 case Intrinsic::amdgcn_flat_atomic_fmax:
1455 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1456 case Intrinsic::amdgcn_flat_atomic_fmin:
1457 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1458 case Intrinsic::amdgcn_global_atomic_csub:
1459 case Intrinsic::amdgcn_global_atomic_fadd:
1460 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1461 case Intrinsic::amdgcn_global_atomic_fmax:
1462 case Intrinsic::amdgcn_global_atomic_fmax_num:
1463 case Intrinsic::amdgcn_global_atomic_fmin:
1464 case Intrinsic::amdgcn_global_atomic_fmin_num:
1465 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 Ptr = II->getArgOperand(0);
1469 break;
1470 case Intrinsic::amdgcn_global_load_lds:
1471 Ptr = II->getArgOperand(1);
1472 break;
1473 default:
1474 return false;
1475 }
1476 AccessTy = II->getType();
1477 Ops.push_back(Ptr);
1478 return true;
1479}
1480
1481bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1482 unsigned AddrSpace,
1483 uint64_t FlatVariant) const {
1484 if (!Subtarget->hasFlatInstOffsets()) {
1485 // Flat instructions do not have offsets, and only have the register
1486 // address.
1487 return AM.BaseOffs == 0 && AM.Scale == 0;
1488 }
1489
1490 return AM.Scale == 0 &&
1491 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1492 AM.BaseOffs, AddrSpace, FlatVariant));
1493}
1494
1496 if (Subtarget->hasFlatGlobalInsts())
1497 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1499
1500 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1501 // Assume we will use FLAT for all global memory accesses
1502 // on VI.
1503 // FIXME: This assumption is currently wrong. On VI we still use
1504 // MUBUF instructions for the r + i addressing mode. As currently
1505 // implemented, the MUBUF instructions only work on buffer < 4GB.
1506 // It may be possible to support > 4GB buffers with MUBUF instructions,
1507 // by setting the stride value in the resource descriptor which would
1508 // increase the size limit to (stride * 4GB). However, this is risky,
1509 // because it has never been validated.
1510 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1512 }
1513
1514 return isLegalMUBUFAddressingMode(AM);
1515}
1516
1517bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1518 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1519 // additionally can do r + r + i with addr64. 32-bit has more addressing
1520 // mode options. Depending on the resource constant, it can also do
1521 // (i64 r0) + (i32 r1) * (i14 i).
1522 //
1523 // Private arrays end up using a scratch buffer most of the time, so also
1524 // assume those use MUBUF instructions. Scratch loads / stores are currently
1525 // implemented as mubuf instructions with offen bit set, so slightly
1526 // different than the normal addr64.
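 // For example, an access of the form (i64 vaddr) + 16 can keep the 16 in the
 // 12-bit MUBUF immediate offset field rather than adding it to the address.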
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1528 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1529 return false;
1530
1531 // FIXME: Since we can split immediate into soffset and immediate offset,
1532 // would it make sense to allow any immediate?
1533
1534 switch (AM.Scale) {
1535 case 0: // r + i or just i, depending on HasBaseReg.
1536 return true;
1537 case 1:
1538 return true; // We have r + r or r + i.
1539 case 2:
1540 if (AM.HasBaseReg) {
1541 // Reject 2 * r + r.
1542 return false;
1543 }
1544
1545 // Allow 2 * r as r + r
1546 // Or 2 * r + i is allowed as r + r + i.
1547 return true;
1548 default: // Don't allow n * r
1549 return false;
1550 }
1551}
1552
1553bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1554 const AddrMode &AM, Type *Ty,
1555 unsigned AS, Instruction *I) const {
1556 // No global is ever allowed as a base.
1557 if (AM.BaseGV)
1558 return false;
1559
1560 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1561 return isLegalGlobalAddressingMode(AM);
1562
1563 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1567 // If the offset isn't a multiple of 4, it probably isn't going to be
1568 // correctly aligned.
1569 // FIXME: Can we get the real alignment here?
1570 if (AM.BaseOffs % 4 != 0)
1571 return isLegalMUBUFAddressingMode(AM);
1572
1573 if (!Subtarget->hasScalarSubwordLoads()) {
1574 // There are no SMRD extloads, so if we have to do a small type access we
1575 // will use a MUBUF load.
1576 // FIXME?: We also need to do this if unaligned, but we don't know the
1577 // alignment here.
1578 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1579 return isLegalGlobalAddressingMode(AM);
1580 }
1581
1583 // SMRD instructions have an 8-bit, dword offset on SI.
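 // For example, a byte offset of 1020 encodes as dword offset 255, the
 // largest value representable in the 8-bit field.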
1584 if (!isUInt<8>(AM.BaseOffs / 4))
1585 return false;
1586 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1587 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1588 // in 8-bits, it can use a smaller encoding.
1589 if (!isUInt<32>(AM.BaseOffs / 4))
1590 return false;
1591 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1592 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1593 if (!isUInt<20>(AM.BaseOffs))
1594 return false;
1595 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1596 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1597 // for S_BUFFER_* instructions).
1598 if (!isInt<21>(AM.BaseOffs))
1599 return false;
1600 } else {
1601 // On GFX12, all offsets are signed 24-bit in bytes.
1602 if (!isInt<24>(AM.BaseOffs))
1603 return false;
1604 }
1605
1606 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1607 return true;
1608
1609 if (AM.Scale == 1 && AM.HasBaseReg)
1610 return true;
1611
1612 return false;
1613 }
1614
1615 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1616 return Subtarget->enableFlatScratch()
1617 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1619 : isLegalMUBUFAddressingMode(AM);
1620
1621 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1622 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1623 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1624 // field.
1625 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1626 // an 8-bit dword offset but we don't know the alignment here.
1627 if (!isUInt<16>(AM.BaseOffs))
1628 return false;
1629
1630 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1631 return true;
1632
1633 if (AM.Scale == 1 && AM.HasBaseReg)
1634 return true;
1635
1636 return false;
1637 }
1638
1640 // For an unknown address space, this usually means that this is for some
1641 // reason being used for pure arithmetic, and not based on some addressing
1642 // computation. We don't have instructions that compute pointers with any
1643 // addressing modes, so treat them as having no offset like flat
1644 // instructions.
1645 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1647 }
1648
1649 // Assume a user alias of global for unknown address spaces.
1650 return isLegalGlobalAddressingMode(AM);
1651}
1652
1653bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1654 const MachineFunction &MF) const {
1655 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1656 return (MemVT.getSizeInBits() <= 4 * 32);
1657 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1658 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1659 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1660 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1661 return (MemVT.getSizeInBits() <= 2 * 32);
1662 }
1663 return true;
1664}
1665
1666bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1667 unsigned Size, unsigned AddrSpace, Align Alignment,
1668 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1669 if (IsFast)
1670 *IsFast = 0;
1671
1672 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1673 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1674 // Check if alignment requirements for ds_read/write instructions are
1675 // disabled.
1676 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1677 return false;
1678
1679 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1680 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1681 Alignment < RequiredAlignment)
1682 return false;
1683
1684 // Either the alignment requirements are "enabled", or there is an
1685 // unaligned LDS access related hardware bug even though alignment
1686 // requirements are "disabled". In either case, we need to check for proper
1687 // alignment requirements.
1688 //
1689 switch (Size) {
1690 case 64:
1691 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1692 // address is negative, then the instruction is incorrectly treated as
1693 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1694 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1695 // load later in the SILoadStoreOptimizer.
1696 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1697 return false;
1698
1699 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1700 // can do a 4-byte aligned, 8-byte access in a single operation using
1701 // ds_read2/write2_b32 with adjacent offsets.
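 // For example, a 4-byte aligned, 8-byte LDS read can select to ds_read2_b32
 // with offset0 = N and offset1 = N + 1 (offsets counted in dwords).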
1702 RequiredAlignment = Align(4);
1703
1704 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1705 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1706 // ds_write2_b32 depending on the alignment. In either case with either
1707 // alignment there is no faster way of doing this.
1708
1709 // The numbers returned here and below are not additive, it is a 'speed
1710 // rank'. They are just meant to be compared to decide if a certain way
1711 // of lowering an operation is faster than another. For that purpose
1712 // a naturally aligned operation gets its bitsize to indicate that "it
1713 // operates with a speed comparable to N-bit wide load". With the full
1714 // alignment ds128 is slower than ds96 for example. If underaligned it
1715 // is comparable to a speed of a single dword access, which would then
1716 // mean 32 < 128 and it is faster to issue a wide load regardless.
1717 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to a
1718 // wider load which will no longer be aligned, the latter is slower.
1719 if (IsFast)
1720 *IsFast = (Alignment >= RequiredAlignment) ? 64
1721 : (Alignment < Align(4)) ? 32
1722 : 1;
1723 return true;
1724 }
1725
1726 break;
1727 case 96:
1728 if (!Subtarget->hasDS96AndDS128())
1729 return false;
1730
1731 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1732 // gfx8 and older.
1733
1734 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1735 // Naturally aligned access is fastest. However, also report it is Fast
1736 // if memory is aligned less than DWORD. A narrow load or store will be
1737 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1738 // be more of them, so overall we will pay less penalty issuing a single
1739 // instruction.
1740
1741 // See comment on the values above.
1742 if (IsFast)
1743 *IsFast = (Alignment >= RequiredAlignment) ? 96
1744 : (Alignment < Align(4)) ? 32
1745 : 1;
1746 return true;
1747 }
1748
1749 break;
1750 case 128:
1751 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1752 return false;
1753
1754 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1755 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1756 // single operation using ds_read2/write2_b64.
1757 RequiredAlignment = Align(8);
1758
1759 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1760 // Naturally aligned access is fastest. However, also report it is Fast
1761 // if memory is aligned less than DWORD. A narrow load or store will be
1762 // equally slow as a single ds_read_b128/ds_write_b128, but there
1763 // will be more of them, so overall we will pay less penalty issuing a
1764 // single instruction.
1765
1766 // See comment on the values above.
1767 if (IsFast)
1768 *IsFast = (Alignment >= RequiredAlignment) ? 128
1769 : (Alignment < Align(4)) ? 32
1770 : 1;
1771 return true;
1772 }
1773
1774 break;
1775 default:
1776 if (Size > 32)
1777 return false;
1778
1779 break;
1780 }
1781
1782 // See comment on the values above.
1783 // Note that we have a single-dword or sub-dword here, so if underaligned
1784 // it is the slowest possible access, hence the returned value is 0.
1785 if (IsFast)
1786 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1787
1788 return Alignment >= RequiredAlignment ||
1789 Subtarget->hasUnalignedDSAccessEnabled();
1790 }
1791
1792 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1793 bool AlignedBy4 = Alignment >= Align(4);
1794 if (IsFast)
1795 *IsFast = AlignedBy4;
1796
1797 return AlignedBy4 ||
1798 Subtarget->enableFlatScratch() ||
1799 Subtarget->hasUnalignedScratchAccess();
1800 }
1801
1802 // FIXME: We have to be conservative here and assume that flat operations
1803 // will access scratch. If we had access to the IR function, then we
1804 // could determine if any private memory was used in the function.
1805 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1806 !Subtarget->hasUnalignedScratchAccess()) {
1807 bool AlignedBy4 = Alignment >= Align(4);
1808 if (IsFast)
1809 *IsFast = AlignedBy4;
1810
1811 return AlignedBy4;
1812 }
1813
1814 // So long as they are correct, wide global memory operations perform better
1815 // than multiple smaller memory ops -- even when misaligned
1816 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1817 if (IsFast)
1818 *IsFast = Size;
1819
1820 return Alignment >= Align(4) ||
1822 }
1823
1824 // Smaller than dword value must be aligned.
1825 if (Size < 32)
1826 return false;
1827
1828 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1829 // byte-address are ignored, thus forcing Dword alignment.
1830 // This applies to private, global, and constant memory.
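 // For example, a dword load from byte address 0x1002 actually reads the
 // dword at 0x1000, so the access must be 4-byte aligned to be correct.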
1831 if (IsFast)
1832 *IsFast = 1;
1833
1834 return Size >= 32 && Alignment >= Align(4);
1835}
1836
1838 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1839 unsigned *IsFast) const {
1841 Alignment, Flags, IsFast);
1842}
1843
1845 const MemOp &Op, const AttributeList &FuncAttributes) const {
1846 // FIXME: Should account for address space here.
1847
1848 // The default fallback uses the private pointer size as a guess for a type to
1849 // use. Make sure we switch these to 64-bit accesses.
1850
1851 if (Op.size() >= 16 &&
1852 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1853 return MVT::v4i32;
1854
1855 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1856 return MVT::v2i32;
1857
1858 // Use the default.
1859 return MVT::Other;
1860}
1861
1863 const MemSDNode *MemNode = cast<MemSDNode>(N);
1864 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1865}
1866
1868 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1870}
1871
1873 unsigned DestAS) const {
1874 // Flat -> private/local is a simple truncate.
1875 // Flat -> global is a no-op.
1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1877 return true;
1878
1879 const GCNTargetMachine &TM =
1880 static_cast<const GCNTargetMachine &>(getTargetMachine());
1881 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1882}
1883
1885 const MemSDNode *MemNode = cast<MemSDNode>(N);
1886
1888}
1889
1892 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1893 VT.getScalarType().bitsLE(MVT::i16))
1896}
1897
1899 Type *Ty) const {
1900 // FIXME: Could be smarter if called for vector constants.
1901 return true;
1902}
1903
1905 unsigned Index) const {
1907 return false;
1908
1909 // TODO: Add more cases that are cheap.
1910 return Index == 0;
1911}
1912
1914 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1915 switch (Op) {
1916 case ISD::LOAD:
1917 case ISD::STORE:
1918
1919 // These operations are done with 32-bit instructions anyway.
1920 case ISD::AND:
1921 case ISD::OR:
1922 case ISD::XOR:
1923 case ISD::SELECT:
1924 // TODO: Extensions?
1925 return true;
1926 default:
1927 return false;
1928 }
1929 }
1930
1931 // SimplifySetCC uses this function to determine whether or not it should
1932 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1933 if (VT == MVT::i1 && Op == ISD::SETCC)
1934 return false;
1935
1937}
1938
1939SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1940 const SDLoc &SL,
1941 SDValue Chain,
1942 uint64_t Offset) const {
1943 const DataLayout &DL = DAG.getDataLayout();
1946
1947 const ArgDescriptor *InputPtrReg;
1948 const TargetRegisterClass *RC;
1949 LLT ArgTy;
1951
1952 std::tie(InputPtrReg, RC, ArgTy) =
1954
1955 // We may not have the kernarg segment argument if we have no kernel
1956 // arguments.
1957 if (!InputPtrReg)
1958 return DAG.getConstant(Offset, SL, PtrVT);
1959
1961 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1962 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1963
1964 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1965}
1966
1967SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1968 const SDLoc &SL) const {
1971 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1972}
1973
1974SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1975 const SDLoc &SL) const {
1976
1978 std::optional<uint32_t> KnownSize =
1980 if (KnownSize.has_value())
1981 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1982 return SDValue();
1983}
1984
1985SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1986 const SDLoc &SL, SDValue Val,
1987 bool Signed,
1988 const ISD::InputArg *Arg) const {
1989 // First, if it is a widened vector, narrow it.
1990 if (VT.isVector() &&
1992 EVT NarrowedVT =
1995 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1996 DAG.getConstant(0, SL, MVT::i32));
1997 }
1998
1999 // Then convert the vector elements or scalar value.
2000 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2001 VT.bitsLT(MemVT)) {
2002 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2003 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2004 }
2005
2006 if (MemVT.isFloatingPoint())
2007 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2008 else if (Signed)
2009 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2010 else
2011 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2012
2013 return Val;
2014}
2015
2016SDValue SITargetLowering::lowerKernargMemParameter(
2017 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2018 uint64_t Offset, Align Alignment, bool Signed,
2019 const ISD::InputArg *Arg) const {
2021
2022 // Try to avoid using an extload by loading earlier than the argument address,
2023 // and extracting the relevant bits. The load should hopefully be merged with
2024 // the previous argument.
2025 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2026 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2027 int64_t AlignDownOffset = alignDown(Offset, 4);
2028 int64_t OffsetDiff = Offset - AlignDownOffset;
2029
2030 EVT IntVT = MemVT.changeTypeToInteger();
2031
2032 // TODO: If we passed in the base kernel offset we could have a better
2033 // alignment than 4, but we don't really need it.
2034 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2035 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2038
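// The argument sits OffsetDiff bytes into the dword just loaded; on this
// little-endian target a logical shift right by OffsetDiff * 8 bits moves its
// bytes down to bit 0 before the truncate to the argument's integer type.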
2039 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2040 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2041
2042 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2043 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2044 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2045
2046
2047 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2048 }
2049
2050 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2051 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2054
2055 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2056 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2057}
2058
2059SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2060 const SDLoc &SL, SDValue Chain,
2061 const ISD::InputArg &Arg) const {
2063 MachineFrameInfo &MFI = MF.getFrameInfo();
2064
2065 if (Arg.Flags.isByVal()) {
2066 unsigned Size = Arg.Flags.getByValSize();
2067 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2068 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2069 }
2070
2071 unsigned ArgOffset = VA.getLocMemOffset();
2072 unsigned ArgSize = VA.getValVT().getStoreSize();
2073
2074 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2075
2076 // Create load nodes to retrieve arguments from the stack.
2077 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2078 SDValue ArgValue;
2079
2080 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2082 MVT MemVT = VA.getValVT();
2083
2084 switch (VA.getLocInfo()) {
2085 default:
2086 break;
2087 case CCValAssign::BCvt:
2088 MemVT = VA.getLocVT();
2089 break;
2090 case CCValAssign::SExt:
2091 ExtType = ISD::SEXTLOAD;
2092 break;
2093 case CCValAssign::ZExt:
2094 ExtType = ISD::ZEXTLOAD;
2095 break;
2096 case CCValAssign::AExt:
2097 ExtType = ISD::EXTLOAD;
2098 break;
2099 }
2100
2101 ArgValue = DAG.getExtLoad(
2102 ExtType, SL, VA.getLocVT(), Chain, FIN,
2104 MemVT);
2105 return ArgValue;
2106}
2107
2108SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2109 const SIMachineFunctionInfo &MFI,
2110 EVT VT,
2112 const ArgDescriptor *Reg = nullptr;
2113 const TargetRegisterClass *RC;
2114 LLT Ty;
2115
2117 const ArgDescriptor WorkGroupIDX =
2118 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2119 // If GridZ is not programmed in an entry function then the hardware will set
2120 // it to all zeros, so there is no need to mask the GridY value in the low
2121 // order bits.
2122 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2123 AMDGPU::TTMP7,
2124 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2125 const ArgDescriptor WorkGroupIDZ =
2126 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
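// With architected SGPRs the workgroup IDs arrive in trap temporaries: TTMP9
// carries the X ID, while TTMP7 packs the Y ID in its low 16 bits and the Z ID
// in its high 16 bits, which is what the masks above encode.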
2127 if (Subtarget->hasArchitectedSGPRs() &&
2129 switch (PVID) {
2131 Reg = &WorkGroupIDX;
2132 RC = &AMDGPU::SReg_32RegClass;
2133 Ty = LLT::scalar(32);
2134 break;
2136 Reg = &WorkGroupIDY;
2137 RC = &AMDGPU::SReg_32RegClass;
2138 Ty = LLT::scalar(32);
2139 break;
2141 Reg = &WorkGroupIDZ;
2142 RC = &AMDGPU::SReg_32RegClass;
2143 Ty = LLT::scalar(32);
2144 break;
2145 default:
2146 break;
2147 }
2148 }
2149
2150 if (!Reg)
2151 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2152 if (!Reg) {
2154 // It's possible for a kernarg intrinsic call to appear in a kernel with
2155 // no allocated segment, in which case we do not add the user sgpr
2156 // argument, so just return null.
2157 return DAG.getConstant(0, SDLoc(), VT);
2158 }
2159
2160 // It's undefined behavior if a function marked with the amdgpu-no-*
2161 // attributes uses the corresponding intrinsic.
2162 return DAG.getUNDEF(VT);
2163 }
2164
2165 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2166}
2167
2169 CallingConv::ID CallConv,
2170 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2171 FunctionType *FType,
2172 SIMachineFunctionInfo *Info) {
2173 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2174 const ISD::InputArg *Arg = &Ins[I];
2175
2176 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2177 "vector type argument should have been split");
2178
2179 // First check if it's a PS input addr.
2180 if (CallConv == CallingConv::AMDGPU_PS &&
2181 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2182 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2183
2184 // Inconveniently only the first part of the split is marked as isSplit,
2185 // so skip to the end. We only want to increment PSInputNum once for the
2186 // entire split argument.
2187 if (Arg->Flags.isSplit()) {
2188 while (!Arg->Flags.isSplitEnd()) {
2189 assert((!Arg->VT.isVector() ||
2190 Arg->VT.getScalarSizeInBits() == 16) &&
2191 "unexpected vector split in ps argument type");
2192 if (!SkipArg)
2193 Splits.push_back(*Arg);
2194 Arg = &Ins[++I];
2195 }
2196 }
2197
2198 if (SkipArg) {
2199 // We can safely skip PS inputs.
2200 Skipped.set(Arg->getOrigArgIndex());
2201 ++PSInputNum;
2202 continue;
2203 }
2204
2205 Info->markPSInputAllocated(PSInputNum);
2206 if (Arg->Used)
2207 Info->markPSInputEnabled(PSInputNum);
2208
2209 ++PSInputNum;
2210 }
2211
2212 Splits.push_back(*Arg);
2213 }
2214}
2215
2216// Allocate special inputs passed in VGPRs.
2218 MachineFunction &MF,
2219 const SIRegisterInfo &TRI,
2220 SIMachineFunctionInfo &Info) const {
2221 const LLT S32 = LLT::scalar(32);
2223
2224 if (Info.hasWorkItemIDX()) {
2225 Register Reg = AMDGPU::VGPR0;
2226 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2227
2228 CCInfo.AllocateReg(Reg);
2229 unsigned Mask = (Subtarget->hasPackedTID() &&
2230 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2231 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2232 }
2233
2234 if (Info.hasWorkItemIDY()) {
2235 assert(Info.hasWorkItemIDX());
2236 if (Subtarget->hasPackedTID()) {
2237 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2238 0x3ff << 10));
2239 } else {
2240 unsigned Reg = AMDGPU::VGPR1;
2241 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2242
2243 CCInfo.AllocateReg(Reg);
2244 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2245 }
2246 }
2247
2248 if (Info.hasWorkItemIDZ()) {
2249 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2250 if (Subtarget->hasPackedTID()) {
2251 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2252 0x3ff << 20));
2253 } else {
2254 unsigned Reg = AMDGPU::VGPR2;
2255 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2256
2257 CCInfo.AllocateReg(Reg);
2258 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2259 }
2260 }
2261}
2262
2263 // Try to allocate a VGPR at the end of the argument list, or if no argument
2264 // VGPRs are left, allocate a stack slot instead.
2265 // If \p Mask is given it indicates the bitfield position in the register.
2266 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2267static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2268 ArgDescriptor Arg = ArgDescriptor()) {
2269 if (Arg.isSet())
2270 return ArgDescriptor::createArg(Arg, Mask);
2271
2272 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2273 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2274 if (RegIdx == ArgVGPRs.size()) {
2275 // Spill to stack required.
2276 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2277
2278 return ArgDescriptor::createStack(Offset, Mask);
2279 }
2280
2281 unsigned Reg = ArgVGPRs[RegIdx];
2282 Reg = CCInfo.AllocateReg(Reg);
2283 assert(Reg != AMDGPU::NoRegister);
2284
2285 MachineFunction &MF = CCInfo.getMachineFunction();
2286 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2287 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2288 return ArgDescriptor::createRegister(Reg, Mask);
2289}
2290
2292 const TargetRegisterClass *RC,
2293 unsigned NumArgRegs) {
2294 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2295 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2296 if (RegIdx == ArgSGPRs.size())
2297 report_fatal_error("ran out of SGPRs for arguments");
2298
2299 unsigned Reg = ArgSGPRs[RegIdx];
2300 Reg = CCInfo.AllocateReg(Reg);
2301 assert(Reg != AMDGPU::NoRegister);
2302
2303 MachineFunction &MF = CCInfo.getMachineFunction();
2304 MF.addLiveIn(Reg, RC);
2306}
2307
2308// If this has a fixed position, we still should allocate the register in the
2309// CCInfo state. Technically we could get away with this for values passed
2310// outside of the normal argument range.
2312 const TargetRegisterClass *RC,
2313 MCRegister Reg) {
2314 Reg = CCInfo.AllocateReg(Reg);
2315 assert(Reg != AMDGPU::NoRegister);
2316 MachineFunction &MF = CCInfo.getMachineFunction();
2317 MF.addLiveIn(Reg, RC);
2318}
2319
2320static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2321 if (Arg) {
2322 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2323 Arg.getRegister());
2324 } else
2325 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2326}
2327
2328static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2329 if (Arg) {
2330 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2331 Arg.getRegister());
2332 } else
2333 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2334}
2335
2336/// Allocate implicit function VGPR arguments at the end of allocated user
2337/// arguments.
2339 CCState &CCInfo, MachineFunction &MF,
2340 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2341 const unsigned Mask = 0x3ff;
2342 ArgDescriptor Arg;
2343
2344 if (Info.hasWorkItemIDX()) {
2345 Arg = allocateVGPR32Input(CCInfo, Mask);
2346 Info.setWorkItemIDX(Arg);
2347 }
2348
2349 if (Info.hasWorkItemIDY()) {
2350 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2351 Info.setWorkItemIDY(Arg);
2352 }
2353
2354 if (Info.hasWorkItemIDZ())
2355 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2356}
2357
2358/// Allocate implicit function VGPR arguments in fixed registers.
2360 CCState &CCInfo, MachineFunction &MF,
2361 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2362 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2363 if (!Reg)
2364 report_fatal_error("failed to allocate VGPR for implicit arguments");
2365
2366 const unsigned Mask = 0x3ff;
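// All three workitem IDs share this single VGPR as 10-bit fields: X in bits
// [9:0], Y in bits [19:10] and Z in bits [29:20].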
2367 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2368 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2369 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2370}
2371
2373 CCState &CCInfo,
2374 MachineFunction &MF,
2375 const SIRegisterInfo &TRI,
2376 SIMachineFunctionInfo &Info) const {
2377 auto &ArgInfo = Info.getArgInfo();
2378 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2379
2380 // TODO: Unify handling with private memory pointers.
2381 if (UserSGPRInfo.hasDispatchPtr())
2382 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2383
2384 const Module *M = MF.getFunction().getParent();
2385 if (UserSGPRInfo.hasQueuePtr() &&
2387 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2388
2389 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2390 // constant offset from the kernarg segment.
2391 if (Info.hasImplicitArgPtr())
2392 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2393
2394 if (UserSGPRInfo.hasDispatchID())
2395 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2396
2397 // flat_scratch_init is not applicable for non-kernel functions.
2398
2399 if (Info.hasWorkGroupIDX())
2400 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2401
2402 if (Info.hasWorkGroupIDY())
2403 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2404
2405 if (Info.hasWorkGroupIDZ())
2406 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2407
2408 if (Info.hasLDSKernelId())
2409 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2410}
2411
2412// Allocate special inputs passed in user SGPRs.
2414 MachineFunction &MF,
2415 const SIRegisterInfo &TRI,
2416 SIMachineFunctionInfo &Info) const {
2417 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2418 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2419 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2420 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2421 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2422 }
2423
2424 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2425 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2426 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2427 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2428 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2429 }
2430
2431 if (UserSGPRInfo.hasDispatchPtr()) {
2432 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2433 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2434 CCInfo.AllocateReg(DispatchPtrReg);
2435 }
2436
2437 const Module *M = MF.getFunction().getParent();
2438 if (UserSGPRInfo.hasQueuePtr() &&
2440 Register QueuePtrReg = Info.addQueuePtr(TRI);
2441 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2442 CCInfo.AllocateReg(QueuePtrReg);
2443 }
2444
2445 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2447 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2448 CCInfo.AllocateReg(InputPtrReg);
2449
2450 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2451 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2452 }
2453
2454 if (UserSGPRInfo.hasDispatchID()) {
2455 Register DispatchIDReg = Info.addDispatchID(TRI);
2456 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2457 CCInfo.AllocateReg(DispatchIDReg);
2458 }
2459
2460 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2461 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2462 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2463 CCInfo.AllocateReg(FlatScratchInitReg);
2464 }
2465
2466 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2467 // these from the dispatch pointer.
2468}
2469
2470 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2471// sequential starting from the first argument.
2473 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2475 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2476 Function &F = MF.getFunction();
2477 unsigned LastExplicitArgOffset =
2478 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2479 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2480 bool InPreloadSequence = true;
2481 unsigned InIdx = 0;
2482 for (auto &Arg : F.args()) {
2483 if (!InPreloadSequence || !Arg.hasInRegAttr())
2484 break;
2485
2486 int ArgIdx = Arg.getArgNo();
2487 // Don't preload non-original args or parts not in the current preload
2488 // sequence.
2489 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2490 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2491 break;
2492
2493 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2494 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2495 InIdx++) {
2496 assert(ArgLocs[ArgIdx].isMemLoc());
2497 auto &ArgLoc = ArgLocs[InIdx];
2498 const Align KernelArgBaseAlign = Align(16);
2499 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2500 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2501 unsigned NumAllocSGPRs =
2502 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2503
2504 // Arg is preloaded into the previous SGPR.
2505 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2506 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2507 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2508 continue;
2509 }
2510
2511 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2512 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
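// Each user SGPR covers 4 bytes of kernarg data, so alignment padding between
// the previous argument's end and this argument's offset also consumes whole
// SGPRs.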
2513 // Check for free user SGPRs for preloading.
2514 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2515 SGPRInfo.getNumFreeUserSGPRs()) {
2516 InPreloadSequence = false;
2517 break;
2518 }
2519
2520 // Preload this argument.
2521 const TargetRegisterClass *RC =
2522 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2523 SmallVectorImpl<MCRegister> *PreloadRegs =
2524 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2525
2526 if (PreloadRegs->size() > 1)
2527 RC = &AMDGPU::SGPR_32RegClass;
2528 for (auto &Reg : *PreloadRegs) {
2529 assert(Reg);
2530 MF.addLiveIn(Reg, RC);
2531 CCInfo.AllocateReg(Reg);
2532 }
2533
2534 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2535 }
2536 }
2537}
2538
2540 const SIRegisterInfo &TRI,
2541 SIMachineFunctionInfo &Info) const {
2542 // Always allocate this last since it is a synthetic preload.
2543 if (Info.hasLDSKernelId()) {
2544 Register Reg = Info.addLDSKernelId();
2545 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2546 CCInfo.AllocateReg(Reg);
2547 }
2548}
2549
2550// Allocate special input registers that are initialized per-wave.
2552 MachineFunction &MF,
2554 CallingConv::ID CallConv,
2555 bool IsShader) const {
2556 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2557 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2558 // Note: user SGPRs are handled by the front-end for graphics shaders
2559 // Pad up the used user SGPRs with dead inputs.
2560
2561 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2562 // before enabling architected SGPRs for workgroup IDs.
2563 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2564
2565 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2566 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2567 // rely on it to reach 16 since if we end up having no stack usage, it will
2568 // not really be added.
2569 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2570 Info.hasWorkGroupIDY() +
2571 Info.hasWorkGroupIDZ() +
2572 Info.hasWorkGroupInfo();
2573 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2574 Register Reg = Info.addReservedUserSGPR();
2575 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2576 CCInfo.AllocateReg(Reg);
2577 }
2578 }
2579
2580 if (!HasArchitectedSGPRs) {
2581 if (Info.hasWorkGroupIDX()) {
2582 Register Reg = Info.addWorkGroupIDX();
2583 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2584 CCInfo.AllocateReg(Reg);
2585 }
2586
2587 if (Info.hasWorkGroupIDY()) {
2588 Register Reg = Info.addWorkGroupIDY();
2589 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2590 CCInfo.AllocateReg(Reg);
2591 }
2592
2593 if (Info.hasWorkGroupIDZ()) {
2594 Register Reg = Info.addWorkGroupIDZ();
2595 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2596 CCInfo.AllocateReg(Reg);
2597 }
2598 }
2599
2600 if (Info.hasWorkGroupInfo()) {
2601 Register Reg = Info.addWorkGroupInfo();
2602 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2603 CCInfo.AllocateReg(Reg);
2604 }
2605
2606 if (Info.hasPrivateSegmentWaveByteOffset()) {
2607 // Scratch wave offset passed in system SGPR.
2608 unsigned PrivateSegmentWaveByteOffsetReg;
2609
2610 if (IsShader) {
2611 PrivateSegmentWaveByteOffsetReg =
2612 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2613
2614 // This is true if the scratch wave byte offset doesn't have a fixed
2615 // location.
2616 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2617 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2618 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2619 }
2620 } else
2621 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2622
2623 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2624 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2625 }
2626
2627 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2628 Info.getNumPreloadedSGPRs() >= 16);
2629}
2630
2632 MachineFunction &MF,
2633 const SIRegisterInfo &TRI,
2634 SIMachineFunctionInfo &Info) {
2635 // Now that we've figured out where the scratch register inputs are, see if
2636 // we should reserve the arguments and use them directly.
2637 MachineFrameInfo &MFI = MF.getFrameInfo();
2638 bool HasStackObjects = MFI.hasStackObjects();
2639 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2640
2641 // Record that we know we have non-spill stack objects so we don't need to
2642 // check all stack objects later.
2643 if (HasStackObjects)
2644 Info.setHasNonSpillStackObjects(true);
2645
2646 // Everything live out of a block is spilled with fast regalloc, so it's
2647 // almost certain that spilling will be required.
2648 if (TM.getOptLevel() == CodeGenOptLevel::None)
2649 HasStackObjects = true;
2650
2651 // For now assume stack access is needed in any callee function, so we need
2652 // the scratch registers to pass in.
2653 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2654
2655 if (!ST.enableFlatScratch()) {
2656 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2657 // If we have stack objects, we unquestionably need the private buffer
2658 // resource. For the Code Object V2 ABI, this will be the first 4 user
2659 // SGPR inputs. We can reserve those and use them directly.
2660
2661 Register PrivateSegmentBufferReg =
2663 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2664 } else {
2665 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2666 // We tentatively reserve the last available registers (skipping those
2667 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2668 // we'll replace these with the ones immediately after those which were
2669 // really allocated. In the prologue copies will be inserted from the
2670 // argument to these reserved registers.
2671
2672 // Without HSA, relocations are used for the scratch pointer and the
2673 // buffer resource setup is always inserted in the prologue. Scratch wave
2674 // offset is still in an input SGPR.
2675 Info.setScratchRSrcReg(ReservedBufferReg);
2676 }
2677 }
2678
2680
2681 // For entry functions we have to set up the stack pointer if we use it,
2682 // whereas non-entry functions get this "for free". This means there is no
2683 // intrinsic advantage to using S32 over S34 in cases where we do not have
2684 // calls but do need a frame pointer (i.e. if we are requested to have one
2685 // because frame pointer elimination is disabled). To keep things simple we
2686 // only ever use S32 as the call ABI stack pointer, and so using it does not
2687 // imply we need a separate frame pointer.
2688 //
2689 // Try to use s32 as the SP, but move it if it would interfere with input
2690 // arguments. This won't work with calls though.
2691 //
2692 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2693 // registers.
2694 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2695 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2696 } else {
2698
2699 if (MFI.hasCalls())
2700 report_fatal_error("call in graphics shader with too many input SGPRs");
2701
2702 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2703 if (!MRI.isLiveIn(Reg)) {
2704 Info.setStackPtrOffsetReg(Reg);
2705 break;
2706 }
2707 }
2708
2709 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2710 report_fatal_error("failed to find register for SP");
2711 }
2712
2713 // hasFP should be accurate for entry functions even before the frame is
2714 // finalized, because it does not rely on the known stack size, only
2715 // properties like whether variable sized objects are present.
2716 if (ST.getFrameLowering()->hasFP(MF)) {
2717 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2718 }
2719}
2720
2723 return !Info->isEntryFunction();
2724}
2725
2727
2728}
2729
2731 MachineBasicBlock *Entry,
2732 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2734
2735 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2736 if (!IStart)
2737 return;
2738
2739 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2740 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2741 MachineBasicBlock::iterator MBBI = Entry->begin();
2742 for (const MCPhysReg *I = IStart; *I; ++I) {
2743 const TargetRegisterClass *RC = nullptr;
2744 if (AMDGPU::SReg_64RegClass.contains(*I))
2745 RC = &AMDGPU::SGPR_64RegClass;
2746 else if (AMDGPU::SReg_32RegClass.contains(*I))
2747 RC = &AMDGPU::SGPR_32RegClass;
2748 else
2749 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2750
2751 Register NewVR = MRI->createVirtualRegister(RC);
2752 // Create copy from CSR to a virtual register.
2753 Entry->addLiveIn(*I);
2754 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2755 .addReg(*I);
2756
2757 // Insert the copy-back instructions right before the terminator.
2758 for (auto *Exit : Exits)
2759 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2760 TII->get(TargetOpcode::COPY), *I)
2761 .addReg(NewVR);
2762 }
2763}
2764
2766 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2767 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2768 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2770
2772 const Function &Fn = MF.getFunction();
2775
2776 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2777 DiagnosticInfoUnsupported NoGraphicsHSA(
2778 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2779 DAG.getContext()->diagnose(NoGraphicsHSA);
2780 return DAG.getEntryNode();
2781 }
2782
2785 BitVector Skipped(Ins.size());
2786 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2787 *DAG.getContext());
2788
2789 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2790 bool IsKernel = AMDGPU::isKernel(CallConv);
2791 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2792
2793 if (IsGraphics) {
2794 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2795 assert(!UserSGPRInfo.hasDispatchPtr() &&
2796 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2797 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2798 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2799 (void)UserSGPRInfo;
2800 if (!Subtarget->enableFlatScratch())
2801 assert(!UserSGPRInfo.hasFlatScratchInit());
2802 if ((CallConv != CallingConv::AMDGPU_CS &&
2803 CallConv != CallingConv::AMDGPU_Gfx) ||
2804 !Subtarget->hasArchitectedSGPRs())
2805 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2806 !Info->hasWorkGroupIDZ());
2807 }
2808
2809 if (CallConv == CallingConv::AMDGPU_PS) {
2810 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2811
2812 // At least one interpolation mode must be enabled or else the GPU will
2813 // hang.
2814 //
2815 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2816 // set PSInputAddr, the user wants to enable some bits after the compilation
2817 // based on run-time states. Since we can't know what the final PSInputEna
2818 // will look like, we shouldn't do anything here and the user should take
2819 // responsibility for the correct programming.
2820 //
2821 // Otherwise, the following restrictions apply:
2822 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2823 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2824 // enabled too.
2825 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2826 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2827 CCInfo.AllocateReg(AMDGPU::VGPR0);
2828 CCInfo.AllocateReg(AMDGPU::VGPR1);
2829 Info->markPSInputAllocated(0);
2830 Info->markPSInputEnabled(0);
2831 }
2832 if (Subtarget->isAmdPalOS()) {
2833 // For isAmdPalOS, the user does not enable some bits after compilation
2834 // based on run-time states; the register values being generated here are
2835 // the final ones set in hardware. Therefore we need to apply the
2836 // workaround to PSInputAddr and PSInputEnable together. (The case where
2837 // a bit is set in PSInputAddr but not PSInputEnable is where the
2838 // frontend set up an input arg for a particular interpolation mode, but
2839 // nothing uses that input arg. Really we should have an earlier pass
2840 // that removes such an arg.)
2841 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2842 if ((PsInputBits & 0x7F) == 0 ||
2843 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2844 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2845 }
2846 } else if (IsKernel) {
2847 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2848 } else {
2849 Splits.append(Ins.begin(), Ins.end());
2850 }
2851
2852 if (IsKernel)
2853 analyzeFormalArgumentsCompute(CCInfo, Ins);
2854
2855 if (IsEntryFunc) {
2856 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2857 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2858 if (IsKernel && Subtarget->hasKernargPreload())
2859 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2860
2861 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2862 } else if (!IsGraphics) {
2863 // For the fixed ABI, pass workitem IDs in the last argument register.
2864 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2865
2866 // FIXME: Sink this into allocateSpecialInputSGPRs
2867 if (!Subtarget->enableFlatScratch())
2868 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2869
2870 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2871 }
2872
2873 if (!IsKernel) {
2874 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2875 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2876 }
2877
2879
2880 // FIXME: This is the minimum kernel argument alignment. We should improve
2881 // this to the maximum alignment of the arguments.
2882 //
2883 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2884 // kern arg offset.
2885 const Align KernelArgBaseAlign = Align(16);
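// commonAlignment() below yields, for each argument, the largest power-of-two
// alignment implied by both the 16-byte kernarg base alignment and the
// argument's byte offset.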
2886
2887 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2888 const ISD::InputArg &Arg = Ins[i];
2889 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2890 InVals.push_back(DAG.getUNDEF(Arg.VT));
2891 continue;
2892 }
2893
2894 CCValAssign &VA = ArgLocs[ArgIdx++];
2895 MVT VT = VA.getLocVT();
2896
2897 if (IsEntryFunc && VA.isMemLoc()) {
2898 VT = Ins[i].VT;
2899 EVT MemVT = VA.getLocVT();
2900
2901 const uint64_t Offset = VA.getLocMemOffset();
2902 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2903
2904 if (Arg.Flags.isByRef()) {
2905 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2906
2907 const GCNTargetMachine &TM =
2908 static_cast<const GCNTargetMachine &>(getTargetMachine());
2909 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2910 Arg.Flags.getPointerAddrSpace())) {
2913 }
2914
2915 InVals.push_back(Ptr);
2916 continue;
2917 }
2918
2919 SDValue NewArg;
2920 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2921 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2922 // In this case the argument is packed into the previous preload SGPR.
2923 int64_t AlignDownOffset = alignDown(Offset, 4);
2924 int64_t OffsetDiff = Offset - AlignDownOffset;
2925 EVT IntVT = MemVT.changeTypeToInteger();
2926
2930 Register Reg =
2931 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2932
2933 assert(Reg);
2934 Register VReg = MRI.getLiveInVirtReg(Reg);
2935 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2936
2937 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2938 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2939
2940 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2941 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2942 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2943 Ins[i].Flags.isSExt(), &Ins[i]);
2944
2945 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2946 } else {
2950 const SmallVectorImpl<MCRegister> &PreloadRegs =
2951 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2952
2953 SDValue Copy;
2954 if (PreloadRegs.size() == 1) {
2955 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2956 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2957 NewArg = DAG.getCopyFromReg(
2958 Chain, DL, VReg,
2960 TRI->getRegSizeInBits(*RC)));
2961
2962 } else {
2963 // If the kernarg alignment does not match the alignment of the SGPR
2964 // tuple RC that can accommodate this argument, it will be built up
2965 // via copies from the individual SGPRs that the argument was
2966 // preloaded to.
2968 for (auto Reg : PreloadRegs) {
2969 Register VReg = MRI.getLiveInVirtReg(Reg);
2970 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2971 Elts.push_back(Copy);
2972 }
2973 NewArg =
2974 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2975 PreloadRegs.size()),
2976 DL, Elts);
2977 }
2978
2979 SDValue CMemVT;
2980 if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
2981 CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
2982 else
2983 CMemVT = DAG.getBitcast(MemVT, NewArg);
2984 NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
2985 Ins[i].Flags.isSExt(), &Ins[i]);
2986 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
2987 }
2988 } else {
2989 NewArg =
2990 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
2991 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2992 }
2993 Chains.push_back(NewArg.getValue(1));
2994
2995 auto *ParamTy =
2996 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2998 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2999 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3000 // On SI local pointers are just offsets into LDS, so they are always
3001 // less than 16 bits. On CI and newer they could potentially be
3002 // real pointers, so we can't guarantee their size.
3003 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3004 DAG.getValueType(MVT::i16));
3005 }
3006
3007 InVals.push_back(NewArg);
3008 continue;
3009 } else if (!IsEntryFunc && VA.isMemLoc()) {
3010 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3011 InVals.push_back(Val);
3012 if (!Arg.Flags.isByVal())
3013 Chains.push_back(Val.getValue(1));
3014 continue;
3015 }
3016
3017 assert(VA.isRegLoc() && "Parameter must be in a register!");
3018
3019 Register Reg = VA.getLocReg();
3020 const TargetRegisterClass *RC = nullptr;
3021 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3022 RC = &AMDGPU::VGPR_32RegClass;
3023 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3024 RC = &AMDGPU::SGPR_32RegClass;
3025 else
3026 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3027 EVT ValVT = VA.getValVT();
3028
3029 Reg = MF.addLiveIn(Reg, RC);
3030 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3031
3032 if (Arg.Flags.isSRet()) {
3033 // The return object should be reasonably addressable.
3034
3035 // FIXME: This helps when the return is a real sret. If it is an
3036 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3037 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3038 unsigned NumBits
3040 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3041 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3042 }
3043
3044 // If this is an 8 or 16-bit value, it is really passed promoted
3045 // to 32 bits. Insert an assert[sz]ext to capture this, then
3046 // truncate to the right size.
3047 switch (VA.getLocInfo()) {
3048 case CCValAssign::Full:
3049 break;
3050 case CCValAssign::BCvt:
3051 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3052 break;
3053 case CCValAssign::SExt:
3054 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3055 DAG.getValueType(ValVT));
3056 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3057 break;
3058 case CCValAssign::ZExt:
3059 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3060 DAG.getValueType(ValVT));
3061 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3062 break;
3063 case CCValAssign::AExt:
3064 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3065 break;
3066 default:
3067 llvm_unreachable("Unknown loc info!");
3068 }
3069
3070 InVals.push_back(Val);
3071 }
3072
3073 // Start adding system SGPRs.
3074 if (IsEntryFunc)
3075 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3076
3077 auto &ArgUsageInfo =
3079 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3080
3081 unsigned StackArgSize = CCInfo.getStackSize();
3082 Info->setBytesInStackArgArea(StackArgSize);
3083
3084 return Chains.empty() ? Chain :
3085 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3086}
3087
3088// TODO: If return values can't fit in registers, we should return as many as
3089// possible in registers before passing on stack.
3091 CallingConv::ID CallConv,
3092 MachineFunction &MF, bool IsVarArg,
3094 LLVMContext &Context) const {
3095 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3096 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3097 // for shaders. Vector types should be explicitly handled by CC.
3098 if (AMDGPU::isEntryFunctionCC(CallConv))
3099 return true;
3100
3102 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3103 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3104 return false;
3105
3106 // We must use the stack if return would require unavailable registers.
3107 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3108 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3109 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3110 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3111 return false;
3112
3113 return true;
3114}
3115
3116SDValue
3118 bool isVarArg,
3120 const SmallVectorImpl<SDValue> &OutVals,
3121 const SDLoc &DL, SelectionDAG &DAG) const {
3124
3125 if (AMDGPU::isKernel(CallConv)) {
3126 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3127 OutVals, DL, DAG);
3128 }
3129
3130 bool IsShader = AMDGPU::isShader(CallConv);
3131
3132 Info->setIfReturnsVoid(Outs.empty());
3133 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3134
3135 // CCValAssign - represent the assignment of the return value to a location.
3138
3139 // CCState - Info about the registers and stack slots.
3140 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3141 *DAG.getContext());
3142
3143 // Analyze outgoing return values.
3144 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3145
3146 SDValue Glue;
3148 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3149
3150 // Copy the result values into the output registers.
3151 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3152 ++I, ++RealRVLocIdx) {
3153 CCValAssign &VA = RVLocs[I];
3154 assert(VA.isRegLoc() && "Can only return in registers!");
3155 // TODO: Partially return in registers if return values don't fit.
3156 SDValue Arg = OutVals[RealRVLocIdx];
3157
3158 // Copied from other backends.
3159 switch (VA.getLocInfo()) {
3160 case CCValAssign::Full:
3161 break;
3162 case CCValAssign::BCvt:
3163 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3164 break;
3165 case CCValAssign::SExt:
3166 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3167 break;
3168 case CCValAssign::ZExt:
3169 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3170 break;
3171 case CCValAssign::AExt:
3172 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3173 break;
3174 default:
3175 llvm_unreachable("Unknown loc info!");
3176 }
3177
3178 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3179 Glue = Chain.getValue(1);
3180 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3181 }
3182
3183 // FIXME: Does sret work properly?
3184 if (!Info->isEntryFunction()) {
3185 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3186 const MCPhysReg *I =
3187 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3188 if (I) {
3189 for (; *I; ++I) {
3190 if (AMDGPU::SReg_64RegClass.contains(*I))
3191 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3192 else if (AMDGPU::SReg_32RegClass.contains(*I))
3193 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3194 else
3195 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3196 }
3197 }
3198 }
3199
3200 // Update chain and glue.
3201 RetOps[0] = Chain;
3202 if (Glue.getNode())
3203 RetOps.push_back(Glue);
3204
3205 unsigned Opc = AMDGPUISD::ENDPGM;
3206 if (!IsWaveEnd)
3208 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3209}
3210
3212 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3213 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3214 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3215 SDValue ThisVal) const {
3216 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3217
3218 // Assign locations to each value returned by this call.
3220 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3221 *DAG.getContext());
3222 CCInfo.AnalyzeCallResult(Ins, RetCC);
3223
3224 // Copy all of the result registers out of their specified physreg.
3225 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3226 CCValAssign VA = RVLocs[i];
3227 SDValue Val;
3228
3229 if (VA.isRegLoc()) {
3230 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3231 Chain = Val.getValue(1);
3232 InGlue = Val.getValue(2);
3233 } else if (VA.isMemLoc()) {
3234 report_fatal_error("TODO: return values in memory");
3235 } else
3236 llvm_unreachable("unknown argument location type");
3237
3238 switch (VA.getLocInfo()) {
3239 case CCValAssign::Full:
3240 break;
3241 case CCValAssign::BCvt:
3242 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3243 break;
3244 case CCValAssign::ZExt:
3245 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3246 DAG.getValueType(VA.getValVT()));
3247 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3248 break;
3249 case CCValAssign::SExt:
3250 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3251 DAG.getValueType(VA.getValVT()));
3252 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3253 break;
3254 case CCValAssign::AExt:
3255 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3256 break;
3257 default:
3258 llvm_unreachable("Unknown loc info!");
3259 }
3260
3261 InVals.push_back(Val);
3262 }
3263
3264 return Chain;
3265}
3266
3267 // Add code to pass the special inputs required by the features in use, separate
3268// from the explicit user arguments present in the IR.
3270 CallLoweringInfo &CLI,
3271 CCState &CCInfo,
3272 const SIMachineFunctionInfo &Info,
3273 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3274 SmallVectorImpl<SDValue> &MemOpChains,
3275 SDValue Chain) const {
3276 // If we don't have a call site, this was a call inserted by
3277 // legalization. These can never use special inputs.
3278 if (!CLI.CB)
3279 return;
3280
3281 SelectionDAG &DAG = CLI.DAG;
3282 const SDLoc &DL = CLI.DL;
3283 const Function &F = DAG.getMachineFunction().getFunction();
3284
3285 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3286 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3287
3288 const AMDGPUFunctionArgInfo *CalleeArgInfo
3290 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3291 auto &ArgUsageInfo =
3293 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3294 }
3295
3296 // TODO: Unify with private memory register handling. This is complicated by
3297 // the fact that at least in kernels, the input argument is not necessarily
3298 // in the same location as the input.
3299 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3301 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3302 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
3303 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3304 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3305 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3306 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3307 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3308 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3309 };
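// Each table entry pairs a preloadable ABI value with the attribute that marks
// it unused; inputs whose attribute is present on the call site are skipped in
// the loop below.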
3310
3311 for (auto Attr : ImplicitAttrs) {
3312 const ArgDescriptor *OutgoingArg;
3313 const TargetRegisterClass *ArgRC;
3314 LLT ArgTy;
3315
3316 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3317
3318 // If the callee does not use the attribute value, skip copying the value.
3319 if (CLI.CB->hasFnAttr(Attr.second))
3320 continue;
3321
3322 std::tie(OutgoingArg, ArgRC, ArgTy) =
3323 CalleeArgInfo->getPreloadedValue(InputID);
3324 if (!OutgoingArg)
3325 continue;
3326
3327 const ArgDescriptor *IncomingArg;
3328 const TargetRegisterClass *IncomingArgRC;
3329 LLT Ty;
3330 std::tie(IncomingArg, IncomingArgRC, Ty) =
3331 CallerArgInfo.getPreloadedValue(InputID);
3332 assert(IncomingArgRC == ArgRC);
3333
3334 // All special arguments are ints for now.
3335 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3336 SDValue InputReg;
3337
3338 if (IncomingArg) {
3339 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3340 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3341 // The implicit arg ptr is special because it doesn't have a corresponding
3342 // input for kernels, and is computed from the kernarg segment pointer.
3343 InputReg = getImplicitArgPtr(DAG, DL);
3344 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3345 std::optional<uint32_t> Id =
3347 if (Id.has_value()) {
3348 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3349 } else {
3350 InputReg = DAG.getUNDEF(ArgVT);
3351 }
3352 } else {
3353 // We may have proven the input wasn't needed, although the ABI still
3354 // requires it. We just need to allocate the register appropriately.
3355 InputReg = DAG.getUNDEF(ArgVT);
3356 }
3357
3358 if (OutgoingArg->isRegister()) {
3359 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3360 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3361 report_fatal_error("failed to allocate implicit input argument");
3362 } else {
3363 unsigned SpecialArgOffset =
3364 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3365 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3366 SpecialArgOffset);
3367 MemOpChains.push_back(ArgStore);
3368 }
3369 }
3370
3371 // Pack workitem IDs into a single register, or pass them as-is if already
3372 // packed.
3373 const ArgDescriptor *OutgoingArg;
3374 const TargetRegisterClass *ArgRC;
3375 LLT Ty;
3376
3377 std::tie(OutgoingArg, ArgRC, Ty) =
3379 if (!OutgoingArg)
3380 std::tie(OutgoingArg, ArgRC, Ty) =
3382 if (!OutgoingArg)
3383 std::tie(OutgoingArg, ArgRC, Ty) =
3385 if (!OutgoingArg)
3386 return;
3387
3388 const ArgDescriptor *IncomingArgX = std::get<0>(
3390 const ArgDescriptor *IncomingArgY = std::get<0>(
3392 const ArgDescriptor *IncomingArgZ = std::get<0>(
3394
3395 SDValue InputReg;
3396 SDLoc SL;
3397
3398 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3399 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3400 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3401
3402 // If the incoming IDs are not packed, we need to pack them.
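// The packed register mirrors the layout used elsewhere: X in bits [9:0], Y
// shifted into bits [19:10] and Z into bits [29:20].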
3403 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3404 NeedWorkItemIDX) {
3405 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3406 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3407 } else {
3408 InputReg = DAG.getConstant(0, DL, MVT::i32);
3409 }
3410 }
3411
3412 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3413 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3414 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3415 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3416 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3417 InputReg = InputReg.getNode() ?
3418 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3419 }
3420
3421 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3422 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3423 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3424 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3425 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3426 InputReg = InputReg.getNode() ?
3427 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3428 }
3429
3430 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3431 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3432 // We're in a situation where the outgoing function requires the workitem
3433 // ID, but the calling function does not have it (e.g. a graphics function
3434 // calling a C calling convention function). This is illegal, but we need
3435 // to produce something.
3436 InputReg = DAG.getUNDEF(MVT::i32);
3437 } else {
3438 // Workitem IDs are already packed; any of the present incoming arguments
3439 // will carry all required fields.
3441 IncomingArgX ? *IncomingArgX :
3442 IncomingArgY ? *IncomingArgY :
3443 *IncomingArgZ, ~0u);
3444 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3445 }
3446 }
3447
3448 if (OutgoingArg->isRegister()) {
3449 if (InputReg)
3450 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3451
3452 CCInfo.AllocateReg(OutgoingArg->getRegister());
3453 } else {
3454 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3455 if (InputReg) {
3456 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3457 SpecialArgOffset);
3458 MemOpChains.push_back(ArgStore);
3459 }
3460 }
3461}
3462
3464 return CC == CallingConv::Fast;
3465}
3466
3467/// Return true if we might ever do TCO for calls with this calling convention.
3469 switch (CC) {
3470 case CallingConv::C:
3472 return true;
3473 default:
3474 return canGuaranteeTCO(CC);
3475 }
3476}
3477
3479 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3481 const SmallVectorImpl<SDValue> &OutVals,
3482 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3483 if (AMDGPU::isChainCC(CalleeCC))
3484 return true;
3485
3486 if (!mayTailCallThisCC(CalleeCC))
3487 return false;
3488
3489 // For a divergent call target, we need to do a waterfall loop over the
3490 // possible callees which precludes us from using a simple jump.
3491 if (Callee->isDivergent())
3492 return false;
3493
3495 const Function &CallerF = MF.getFunction();
3496 CallingConv::ID CallerCC = CallerF.getCallingConv();
3498 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3499
3500 // Kernels aren't callable, and don't have a live-in return address, so it
3501 // doesn't make sense to do a tail call with entry functions.
3502 if (!CallerPreserved)
3503 return false;
3504
3505 bool CCMatch = CallerCC == CalleeCC;
3506
3508 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3509 return true;
3510 return false;
3511 }
3512
3513 // TODO: Can we handle var args?
3514 if (IsVarArg)
3515 return false;
3516
3517 for (const Argument &Arg : CallerF.args()) {
3518 if (Arg.hasByValAttr())
3519 return false;
3520 }
3521
3522 LLVMContext &Ctx = *DAG.getContext();
3523
3524 // Check that the call results are passed in the same way.
3525 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3526 CCAssignFnForCall(CalleeCC, IsVarArg),
3527 CCAssignFnForCall(CallerCC, IsVarArg)))
3528 return false;
3529
3530 // The callee has to preserve all registers the caller needs to preserve.
3531 if (!CCMatch) {
3532 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3533 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3534 return false;
3535 }
3536
3537 // Nothing more to check if the callee is taking no arguments.
3538 if (Outs.empty())
3539 return true;
3540
3542 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3543
3544 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3545
3546 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3547 // If the stack arguments for this call do not fit into our own save area then
3548 // the call cannot be made tail.
3549 // TODO: Is this really necessary?
3550 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3551 return false;
3552
3553 const MachineRegisterInfo &MRI = MF.getRegInfo();
3554 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3555}
3556
3558 if (!CI->isTailCall())
3559 return false;
3560
3561 const Function *ParentFn = CI->getParent()->getParent();
3563 return false;
3564 return true;
3565}
3566
3567// The wave scratch offset register is used as the global base pointer.
3569 SmallVectorImpl<SDValue> &InVals) const {
3570 CallingConv::ID CallConv = CLI.CallConv;
3571 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3572
3573 SelectionDAG &DAG = CLI.DAG;
3574
3575 TargetLowering::ArgListEntry RequestedExec;
3576 if (IsChainCallConv) {
3577 // The last argument should be the value that we need to put in EXEC.
3578 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3579 // don't treat it like the rest of the arguments.
3580 RequestedExec = CLI.Args.back();
3581 assert(RequestedExec.Node && "No node for EXEC");
3582
3583 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3584 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3585
3586 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3587 CLI.Outs.pop_back();
3588 CLI.OutVals.pop_back();
3589
3590 if (RequestedExec.Ty->isIntegerTy(64)) {
3591 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3592 CLI.Outs.pop_back();
3593 CLI.OutVals.pop_back();
3594 }
3595
3596 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3597 "Haven't popped all the pieces of the EXEC mask");
3598 }
3599
3600 const SDLoc &DL = CLI.DL;
3602 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3604 SDValue Chain = CLI.Chain;
3605 SDValue Callee = CLI.Callee;
3606 bool &IsTailCall = CLI.IsTailCall;
3607 bool IsVarArg = CLI.IsVarArg;
3608 bool IsSibCall = false;
3610
3611 if (Callee.isUndef() || isNullConstant(Callee)) {
3612 if (!CLI.IsTailCall) {
3613 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3614 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3615 }
3616
3617 return Chain;
3618 }
3619
3620 if (IsVarArg) {
3621 return lowerUnhandledCall(CLI, InVals,
3622 "unsupported call to variadic function ");
3623 }
3624
3625 if (!CLI.CB)
3626 report_fatal_error("unsupported libcall legalization");
3627
3628 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3629 return lowerUnhandledCall(CLI, InVals,
3630 "unsupported required tail call to function ");
3631 }
3632
3633 if (IsTailCall) {
3635 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3636 if (!IsTailCall &&
3637 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3638 report_fatal_error("failed to perform tail call elimination on a call "
3639 "site marked musttail or on llvm.amdgcn.cs.chain");
3640 }
3641
3642 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3643
3644 // A sibling call is one where we're under the usual C ABI and not planning
3645 // to change that but can still do a tail call:
3646 if (!TailCallOpt && IsTailCall)
3647 IsSibCall = true;
3648
3649 if (IsTailCall)
3650 ++NumTailCalls;
3651 }
3652
3655 SmallVector<SDValue, 8> MemOpChains;
3656
3657 // Analyze operands of the call, assigning locations to each operand.
3659 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3660 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3661
3662 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3663 // With a fixed ABI, allocate fixed registers before user arguments.
3664 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3665 }
3666
3667 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3668
3669 // Get a count of how many bytes are to be pushed on the stack.
3670 unsigned NumBytes = CCInfo.getStackSize();
3671
3672 if (IsSibCall) {
3673 // Since we're not changing the ABI to make this a tail call, the memory
3674 // operands are already available in the caller's incoming argument space.
3675 NumBytes = 0;
3676 }
3677
3678 // FPDiff is the byte offset of the call's argument area from the callee's.
3679 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3680 // by this amount for a tail call. In a sibling call it must be 0 because the
3681 // caller will deallocate the entire stack and the callee still expects its
3682 // arguments to begin at SP+0. Completely unused for non-tail calls.
3683 int32_t FPDiff = 0;
3684 MachineFrameInfo &MFI = MF.getFrameInfo();
3685
3686 // Adjust the stack pointer for the new arguments...
3687 // These operations are automatically eliminated by the prolog/epilog pass
3688 if (!IsSibCall)
3689 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3690
3691 if (!IsSibCall || IsChainCallConv) {
3692 if (!Subtarget->enableFlatScratch()) {
3693 SmallVector<SDValue, 4> CopyFromChains;
3694
3695 // In the HSA case, this should be an identity copy.
3696 SDValue ScratchRSrcReg
3697 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3698 RegsToPass.emplace_back(IsChainCallConv
3699 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3700 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3701 ScratchRSrcReg);
3702 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3703 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3704 }
3705 }
3706
3707 MVT PtrVT = MVT::i32;
3708
3709 // Walk the register/memloc assignments, inserting copies/loads.
3710 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3711 CCValAssign &VA = ArgLocs[i];
3712 SDValue Arg = OutVals[i];
3713
3714 // Promote the value if needed.
3715 switch (VA.getLocInfo()) {
3716 case CCValAssign::Full:
3717 break;
3718 case CCValAssign::BCvt:
3719 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3720 break;
3721 case CCValAssign::ZExt:
3722 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3723 break;
3724 case CCValAssign::SExt:
3725 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3726 break;
3727 case CCValAssign::AExt:
3728 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3729 break;
3730 case CCValAssign::FPExt:
3731 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3732 break;
3733 default:
3734 llvm_unreachable("Unknown loc info!");
3735 }
3736
3737 if (VA.isRegLoc()) {
3738 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3739 } else {
3740 assert(VA.isMemLoc());
3741
3742 SDValue DstAddr;
3743 MachinePointerInfo DstInfo;
3744
3745 unsigned LocMemOffset = VA.getLocMemOffset();
3746 int32_t Offset = LocMemOffset;
3747
3748 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3749 MaybeAlign Alignment;
3750
3751 if (IsTailCall) {
3752 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3753 unsigned OpSize = Flags.isByVal() ?
3754 Flags.getByValSize() : VA.getValVT().getStoreSize();
3755
3756 // FIXME: We can have better than the minimum byval required alignment.
3757 Alignment =
3758 Flags.isByVal()
3759 ? Flags.getNonZeroByValAlign()
3760 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3761
3762 Offset = Offset + FPDiff;
3763 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3764
3765 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3766 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3767
3768 // Make sure any stack arguments overlapping with where we're storing
3769 // are loaded before this eventual operation. Otherwise they'll be
3770 // clobbered.
3771
3772 // FIXME: Why is this really necessary? This seems to just result in a
3773 // lot of code to copy the stack values and write them back to the same
3774 // locations, which are supposed to be immutable?
3775 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3776 } else {
3777 // Stores to the argument stack area are relative to the stack pointer.
3778 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3779 MVT::i32);
3780 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3781 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3782 Alignment =
3783 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3784 }
3785
3786 if (Outs[i].Flags.isByVal()) {
3787 SDValue SizeNode =
3788 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3789 SDValue Cpy =
3790 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3791 Outs[i].Flags.getNonZeroByValAlign(),
3792 /*isVol = */ false, /*AlwaysInline = */ true,
3793 /*isTailCall = */ false, DstInfo,
3795
3796 MemOpChains.push_back(Cpy);
3797 } else {
3798 SDValue Store =
3799 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3800 MemOpChains.push_back(Store);
3801 }
3802 }
3803 }
3804
3805 if (!MemOpChains.empty())
3806 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3807
3808 // Build a sequence of copy-to-reg nodes chained together with token chain
3809 // and flag operands which copy the outgoing args into the appropriate regs.
3810 SDValue InGlue;
3811 for (auto &RegToPass : RegsToPass) {
3812 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3813 RegToPass.second, InGlue);
3814 InGlue = Chain.getValue(1);
3815 }
3816
3817
3818 // We don't usually want to end the call-sequence here because we would tidy
3819 // the frame up *after* the call, however in the ABI-changing tail-call case
3820 // we've carefully laid out the parameters so that when sp is reset they'll be
3821 // in the correct location.
3822 if (IsTailCall && !IsSibCall) {
3823 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3824 InGlue = Chain.getValue(1);
3825 }
3826
3827 std::vector<SDValue> Ops;
3828 Ops.push_back(Chain);
3829 Ops.push_back(Callee);
3830 // Add a redundant copy of the callee global which will not be legalized, as
3831 // we need direct access to the callee later.
3832 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3833 const GlobalValue *GV = GSD->getGlobal();
3834 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3835 } else {
3836 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3837 }
3838
3839 if (IsTailCall) {
3840 // Each tail call may have to adjust the stack by a different amount, so
3841 // this information must travel along with the operation for eventual
3842 // consumption by emitEpilogue.
3843 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3844 }
3845
3846 if (IsChainCallConv)
3847 Ops.push_back(RequestedExec.Node);
3848
3849 // Add argument registers to the end of the list so that they are known live
3850 // into the call.
3851 for (auto &RegToPass : RegsToPass) {
3852 Ops.push_back(DAG.getRegister(RegToPass.first,
3853 RegToPass.second.getValueType()));
3854 }
3855
3856 // Add a register mask operand representing the call-preserved registers.
3857 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3858 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3859 assert(Mask && "Missing call preserved mask for calling convention");
3860 Ops.push_back(DAG.getRegisterMask(Mask));
3861
3862 if (InGlue.getNode())
3863 Ops.push_back(InGlue);
3864
3865 // NOTE: This potentially results in *two* glue operands, and the wrong one
3866 // might possibly show up where the other was intended. In particular,
3867 // Emitter::EmitMachineNode() expects only the glued convergence token if it
3868 // exists. Similarly, the selection of the call expects to match only the
3869 // InGlue operand if it exists.
3870 if (SDValue Token = CLI.ConvergenceControlToken) {
3871 Ops.push_back(SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE,
3872 DL, MVT::Glue, Token),
3873 0));
3874 }
3875
3876 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3877
3878 // If we're doing a tail call, use a TC_RETURN here rather than an
3879 // actual call instruction.
3880 if (IsTailCall) {
3881 MFI.setHasTailCall();
3882 unsigned OPC = AMDGPUISD::TC_RETURN;
3883 switch (CallConv) {
3886 break;
3890 break;
3891 }
3892
3893 return DAG.getNode(OPC, DL, NodeTys, Ops);
3894 }
3895
3896 // Returns a chain and a flag for retval copy to use.
3897 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3898 Chain = Call.getValue(0);
3899 InGlue = Call.getValue(1);
3900
3901 uint64_t CalleePopBytes = NumBytes;
3902 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3903 if (!Ins.empty())
3904 InGlue = Chain.getValue(1);
3905
3906 // Handle result values, copying them out of physregs into vregs that we
3907 // return.
3908 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3909 InVals, /*IsThisReturn=*/false, SDValue());
3910}
3911
3912// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3913// except for applying the wave size scale to the increment amount.
3915 SDValue Op, SelectionDAG &DAG) const {
3916 const MachineFunction &MF = DAG.getMachineFunction();
3918
3919 SDLoc dl(Op);
3920 EVT VT = Op.getValueType();
3921 SDValue Tmp1 = Op;
3922 SDValue Tmp2 = Op.getValue(1);
3923 SDValue Tmp3 = Op.getOperand(2);
3924 SDValue Chain = Tmp1.getOperand(0);
3925
3926 Register SPReg = Info->getStackPtrOffsetReg();
3927
3928 // Chain the dynamic stack allocation so that it doesn't modify the stack
3929 // pointer when other instructions are using the stack.
3930 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3931
3932 SDValue Size = Tmp2.getOperand(1);
3933 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3934 Chain = SP.getValue(1);
3935 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3936 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3937 unsigned Opc =
3940
3941 SDValue ScaledSize = DAG.getNode(
3942 ISD::SHL, dl, VT, Size,
3943 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3944
3945 Align StackAlign = TFL->getStackAlign();
3946 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3947 if (Alignment && *Alignment > StackAlign) {
3948 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3949 DAG.getConstant(-(uint64_t)Alignment->value()
3950 << Subtarget->getWavefrontSizeLog2(),
3951 dl, VT));
3952 }
3953
3954 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3955 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3956
3957 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3958}
3959
3961 SelectionDAG &DAG) const {
3962 // We only handle constant sizes here to allow non-entry block, static sized
3963 // allocas. A truly dynamic value is more difficult to support because we
3964 // don't know if the size value is uniform or not. If the size isn't uniform,
3965 // we would need to do a wave reduction to get the maximum size to know how
3966 // much to increment the uniform stack pointer.
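// As a rough worked example of the wave scaling applied by
// lowerDYNAMIC_STACKALLOCImpl above (values are illustrative, not taken from
// the source): a constant alloca of 16 bytes per lane on a wave64 subtarget
// is scaled to
//   ScaledSize = 16 << getWavefrontSizeLog2() = 16 << 6 = 1024
// bytes of wave scratch, so the uniform stack pointer is adjusted by 1024.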
3967 SDValue Size = Op.getOperand(1);
3968 if (isa<ConstantSDNode>(Size))
3969 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3970
3972}
3973
3975 if (Op.getValueType() != MVT::i32)
3976 return Op; // Defer to cannot select error.
3977
3979 SDLoc SL(Op);
3980
3981 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3982
3983 // Convert from wave uniform to swizzled vector address. This should protect
3984 // from any edge cases where the stacksave result isn't directly used with
3985 // stackrestore.
3986 SDValue VectorAddress =
3987 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3988 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3989}
3990
3992 SelectionDAG &DAG) const {
3993 SDLoc SL(Op);
3994 assert(Op.getValueType() == MVT::i32);
3995
3996 uint32_t BothRoundHwReg =
3998 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
3999
4000 SDValue IntrinID =
4001 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4002 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4003 Op.getOperand(0), IntrinID, GetRoundBothImm);
4004
4005 // There are two rounding modes, one for f32 and one for f64/f16. We only
4006 // report in the standard value range if both are the same.
4007 //
4008 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4009 // ties away from zero is not supported, and the other values are rotated by
4010 // 1.
4011 //
4012 // If the two rounding modes are not the same, report a target defined value.
4013
4014 // Mode register rounding mode fields:
4015 //
4016 // [1:0] Single-precision round mode.
4017 // [3:2] Double/Half-precision round mode.
4018 //
4019 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4020 //
4021 //  Mode            Hardware   Spec
4022 //  Toward-0            3        0
4023 //  Nearest Even        0        1
4024 //  +Inf                1        2
4025 //  -Inf                2        3
4026 //  NearestAway0       N/A       4
4027 //
4028 // We have to handle all 16 values of the 4-bit field, so we create a 64-bit
4029 // table we can index by the raw hardware mode.
4030 //
4031 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
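// A worked sketch of that lookup (entry values inferred from the mapping
// above rather than read out of the table constant): if both fields read back
// as 0, i.e. round-to-nearest-even for f32 and for f64/f16, the raw mode is
// 0b0000, the shift below is 0 * 4 = 0, and the extracted 4-bit entry should
// be 1, the standard "to nearest" FLT_ROUNDS value. A mixed setting such as
// f32 toward-zero with f64/f16 nearest-even selects an entry >= 4, which the
// code further below offsets by 4 into the target-defined range.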
4032
4033 SDValue BitTable =
4035
4036 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4037 SDValue RoundModeTimesNumBits =
4038 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4039
4040 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4041 // knew only one mode was demanded.
4042 SDValue TableValue =
4043 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4044 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4045
4046 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4047 SDValue TableEntry =
4048 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4049
4050 // There's a gap between the 4-bit encoded table values and the actual enum
4051 // values, so add an offset if it's an extended value.
4052 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4053 SDValue IsStandardValue =
4054 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4055 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4056 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4057 TableEntry, EnumOffset);
4058
4059 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4060}
4061
4063 if (Op->isDivergent())
4064 return SDValue();
4065
4066 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4071 break;
4072 default:
4073 return SDValue();
4074 }
4075
4076 return Op;
4077}
4078
4079// Work around DAG legality rules only based on the result type.
4081 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4082 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4083 EVT SrcVT = Src.getValueType();
4084
4085 if (SrcVT.getScalarType() != MVT::bf16)
4086 return Op;
4087
4088 SDLoc SL(Op);
4089 SDValue BitCast =
4090 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4091
4092 EVT DstVT = Op.getValueType();
4093 if (IsStrict)
4094 llvm_unreachable("Need STRICT_BF16_TO_FP");
4095
4096 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4097}
4098
4100 SDLoc SL(Op);
4101 if (Op.getValueType() != MVT::i64)
4102 return Op;
4103
4104 uint32_t ModeHwReg =
4106 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4107 uint32_t TrapHwReg =
4109 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4110
4111 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4112 SDValue IntrinID =
4113 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4114 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4115 Op.getOperand(0), IntrinID, ModeHwRegImm);
4116 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4117 Op.getOperand(0), IntrinID, TrapHwRegImm);
4118 SDValue TokenReg =
4119 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4120 GetTrapReg.getValue(1));
4121
4122 SDValue CvtPtr =
4123 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4124 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4125
4126 return DAG.getMergeValues({Result, TokenReg}, SL);
4127}
4128
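// A minimal sketch of the i64 FP-environment packing shared by lowerGET_FPENV
// above and lowerSET_FPENV below, assuming the usual little-endian element
// order in which vector element 0 becomes the low half of the bitcast i64:
//   fpenv   = ((uint64_t)TRAPSTS << 32) | MODE;   // GET: pack the two reads
//   MODE    = fpenv & 0xffffffff;                 // SET: unpack them again
//   TRAPSTS = fpenv >> 32;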
4130 SDLoc SL(Op);
4131 if (Op.getOperand(1).getValueType() != MVT::i64)
4132 return Op;
4133
4134 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4135 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4136 DAG.getConstant(0, SL, MVT::i32));
4137 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4138 DAG.getConstant(1, SL, MVT::i32));
4139
4140 SDValue ReadFirstLaneID =
4141 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4142 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4143 ReadFirstLaneID, NewModeReg);
4144 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4145 ReadFirstLaneID, NewTrapReg);
4146
4147 unsigned ModeHwReg =
4149 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4150 unsigned TrapHwReg =
4152 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4153
4154 SDValue IntrinID =
4155 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4156 SDValue SetModeReg =
4157 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4158 IntrinID, ModeHwRegImm, NewModeReg);
4159 SDValue SetTrapReg =
4160 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4161 IntrinID, TrapHwRegImm, NewTrapReg);
4162 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4163}
4164
4166 const MachineFunction &MF) const {
4168 .Case("m0", AMDGPU::M0)
4169 .Case("exec", AMDGPU::EXEC)
4170 .Case("exec_lo", AMDGPU::EXEC_LO)
4171 .Case("exec_hi", AMDGPU::EXEC_HI)
4172 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4173 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4174 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4175 .Default(Register());
4176
4177 if (Reg == AMDGPU::NoRegister) {
4178 report_fatal_error(Twine("invalid register name \""
4179 + StringRef(RegName) + "\"."));
4180
4181 }
4182
4183 if (!Subtarget->hasFlatScrRegister() &&
4184 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4185 report_fatal_error(Twine("invalid register \""
4186 + StringRef(RegName) + "\" for subtarget."));
4187 }
4188
4189 switch (Reg) {
4190 case AMDGPU::M0:
4191 case AMDGPU::EXEC_LO:
4192 case AMDGPU::EXEC_HI:
4193 case AMDGPU::FLAT_SCR_LO:
4194 case AMDGPU::FLAT_SCR_HI:
4195 if (VT.getSizeInBits() == 32)
4196 return Reg;
4197 break;
4198 case AMDGPU::EXEC:
4199 case AMDGPU::FLAT_SCR:
4200 if (VT.getSizeInBits() == 64)
4201 return Reg;
4202 break;
4203 default:
4204 llvm_unreachable("missing register type checking");
4205 }
4206
4207 report_fatal_error(Twine("invalid type for register \""
4208 + StringRef(RegName) + "\"."));
4209}
4210
4211// If kill is not the last instruction, split the block so kill is always a
4212// proper terminator.
4215 MachineBasicBlock *BB) const {
4216 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4218 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4219 return SplitBB;
4220}
4221
4222 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4223// \p MI will be the only instruction in the loop body block. Otherwise, it will
4224// be the first instruction in the remainder block.
4225//
4226/// \returns { LoopBody, Remainder }
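/// A rough sketch of the CFG produced by the successor edges added below (the
/// loop-back branch itself is inserted later by the callers):
///
///   MBB --> LoopBB --> RemainderBB
///            ^  |
///            +--+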
4227static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4231
4232 // To insert the loop we need to split the block. Move everything after this
4233 // point to a new block, and insert a new empty block between the two.
4235 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4237 ++MBBI;
4238
4239 MF->insert(MBBI, LoopBB);
4240 MF->insert(MBBI, RemainderBB);
4241
4242 LoopBB->addSuccessor(LoopBB);
4243 LoopBB->addSuccessor(RemainderBB);
4244
4245 // Move the rest of the block into a new block.
4246 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4247
4248 if (InstInLoop) {
4249 auto Next = std::next(I);
4250
4251 // Move instruction to loop body.
4252 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4253
4254 // Move the rest of the block.
4255 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4256 } else {
4257 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4258 }
4259
4260 MBB.addSuccessor(LoopBB);
4261
4262 return std::pair(LoopBB, RemainderBB);
4263}
4264
4265/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4267 MachineBasicBlock *MBB = MI.getParent();
4269 auto I = MI.getIterator();
4270 auto E = std::next(I);
4271
4272 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4273 .addImm(0);
4274
4275 MIBundleBuilder Bundler(*MBB, I, E);
4276 finalizeBundle(*MBB, Bundler.begin());
4277}
4278
4281 MachineBasicBlock *BB) const {
4282 const DebugLoc &DL = MI.getDebugLoc();
4283
4285
4286 MachineBasicBlock *LoopBB;
4287 MachineBasicBlock *RemainderBB;
4289
4290 // Apparently kill flags are only valid if the def is in the same block?
4291 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4292 Src->setIsKill(false);
4293
4294 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4295
4296 MachineBasicBlock::iterator I = LoopBB->end();
4297
4298 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4300
4301 // Clear TRAP_STS.MEM_VIOL
4302 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4303 .addImm(0)
4304 .addImm(EncodedReg);
4305
4307
4308 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4309
4310 // Load and check TRAP_STS.MEM_VIOL
4311 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4312 .addImm(EncodedReg);
4313
4314 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4315 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4316 .addReg(Reg, RegState::Kill)
4317 .addImm(0);
4318 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4319 .addMBB(LoopBB);
4320
4321 return RemainderBB;
4322}
4323
4324// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4325// wavefront. If the value is uniform and just happens to be in a VGPR, this
4326// will only do one iteration. In the worst case, this will loop 64 times.
4327//
4328// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
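// A pseudocode sketch of the waterfall loop emitted below (register roles
// simplified; the indexed access itself is inserted by the callers before the
// returned iterator):
//
//   loop:
//     CurrentIdx = v_readfirstlane(Idx)     // pick one lane's index
//     Cond       = (Idx == CurrentIdx)      // all lanes sharing that index
//     Saved      = EXEC; EXEC &= Cond       // run only those lanes
//     M0 (or SGPRIdxReg) = CurrentIdx + Offset
//     ...indexed access...
//     EXEC = Saved ^ EXEC                   // clear the lanes just handled
//     if (EXEC != 0) goto loop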
4331 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4332 const DebugLoc &DL, const MachineOperand &Idx,
4333 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4334 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4335 Register &SGPRIdxReg) {
4336
4337 MachineFunction *MF = OrigBB.getParent();
4338 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4339 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4341
4342 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4343 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4344 Register NewExec = MRI.createVirtualRegister(BoolRC);
4345 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4346 Register CondReg = MRI.createVirtualRegister(BoolRC);
4347
4348 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4349 .addReg(InitReg)
4350 .addMBB(&OrigBB)
4351 .addReg(ResultReg)
4352 .addMBB(&LoopBB);
4353
4354 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4355 .addReg(InitSaveExecReg)
4356 .addMBB(&OrigBB)
4357 .addReg(NewExec)
4358 .addMBB(&LoopBB);
4359
4360 // Read the next variant <- also loop target.
4361 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4362 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4363
4364 // Compare the just read M0 value to all possible Idx values.
4365 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4366 .addReg(CurrentIdxReg)
4367 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4368
4369 // Update EXEC, save the original EXEC value to VCC.
4370 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4371 : AMDGPU::S_AND_SAVEEXEC_B64),
4372 NewExec)
4373 .addReg(CondReg, RegState::Kill);
4374
4375 MRI.setSimpleHint(NewExec, CondReg);
4376
4377 if (UseGPRIdxMode) {
4378 if (Offset == 0) {
4379 SGPRIdxReg = CurrentIdxReg;
4380 } else {
4381 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4382 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4383 .addReg(CurrentIdxReg, RegState::Kill)
4384 .addImm(Offset);
4385 }
4386 } else {
4387 // Move the index into M0.
4388 if (Offset == 0) {
4389 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4390 .addReg(CurrentIdxReg, RegState::Kill);
4391 } else {
4392 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4393 .addReg(CurrentIdxReg, RegState::Kill)
4394 .addImm(Offset);
4395 }
4396 }
4397
4398 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4399 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4400 MachineInstr *InsertPt =
4401 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4402 : AMDGPU::S_XOR_B64_term), Exec)
4403 .addReg(Exec)
4404 .addReg(NewExec);
4405
4406 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4407 // s_cbranch_scc0?
4408
4409 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4410 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4411 .addMBB(&LoopBB);
4412
4413 return InsertPt->getIterator();
4414}
4415
4416 // This has slightly sub-optimal regalloc when the source vector is killed by
4417 // the read. The register allocator does not understand that the kill is
4418 // per-workitem, so the vector is kept alive for the whole loop and we end up
4419 // not re-using a subregister from it, using one more VGPR than necessary. This
4420 // extra VGPR was avoided back when this was expanded after register allocation.
4423 unsigned InitResultReg, unsigned PhiReg, int Offset,
4424 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4426 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4427 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4429 const DebugLoc &DL = MI.getDebugLoc();
4431
4432 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4433 Register DstReg = MI.getOperand(0).getReg();
4434 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4435 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4436 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4437 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4438
4439 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4440
4441 // Save the EXEC mask
4442 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4443 .addReg(Exec);
4444
4445 MachineBasicBlock *LoopBB;
4446 MachineBasicBlock *RemainderBB;
4447 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4448
4449 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4450
4451 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4452 InitResultReg, DstReg, PhiReg, TmpExec,
4453 Offset, UseGPRIdxMode, SGPRIdxReg);
4454
4455 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4457 ++MBBI;
4458 MF->insert(MBBI, LandingPad);
4459 LoopBB->removeSuccessor(RemainderBB);
4460 LandingPad->addSuccessor(RemainderBB);
4461 LoopBB->addSuccessor(LandingPad);
4462 MachineBasicBlock::iterator First = LandingPad->begin();
4463 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4464 .addReg(SaveExec);
4465
4466 return InsPt;
4467}
4468
4469// Returns subreg index, offset
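// For example, with the 32-bit elements assumed below: for a 128-bit
// super-register (4 elements), Offset = 2 yields { sub2, 0 }, while an
// out-of-range Offset = 5 is returned unchanged as { sub0, 5 } for the
// dynamic path.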
4470static std::pair<unsigned, int>
4472 const TargetRegisterClass *SuperRC,
4473 unsigned VecReg,
4474 int Offset) {
4475 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4476
4477 // Skip out of bounds offsets, or else we would end up using an undefined
4478 // register.
4479 if (Offset >= NumElts || Offset < 0)
4480 return std::pair(AMDGPU::sub0, Offset);
4481
4482 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4483}
4484
4487 int Offset) {
4488 MachineBasicBlock *MBB = MI.getParent();
4489 const DebugLoc &DL = MI.getDebugLoc();
4491
4492 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4493
4494 assert(Idx->getReg() != AMDGPU::NoRegister);
4495
4496 if (Offset == 0) {
4497 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4498 } else {
4499 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4500 .add(*Idx)
4501 .addImm(Offset);
4502 }
4503}
4504
4507 int Offset) {
4508 MachineBasicBlock *MBB = MI.getParent();
4509 const DebugLoc &DL = MI.getDebugLoc();
4511
4512 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4513
4514 if (Offset == 0)
4515 return Idx->getReg();
4516
4517 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4518 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4519 .add(*Idx)
4520 .addImm(Offset);
4521 return Tmp;
4522}
4523
4526 const GCNSubtarget &ST) {
4527 const SIInstrInfo *TII = ST.getInstrInfo();
4528 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4531
4532 Register Dst = MI.getOperand(0).getReg();
4533 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4534 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4535 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4536
4537 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4538 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4539
4540 unsigned SubReg;
4541 std::tie(SubReg, Offset)
4542 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4543
4544 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4545
4546 // Check for a SGPR index.
4547 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4549 const DebugLoc &DL = MI.getDebugLoc();
4550
4551 if (UseGPRIdxMode) {
4552 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4553 // to avoid interfering with other uses, so probably requires a new
4554 // optimization pass.
4556
4557 const MCInstrDesc &GPRIDXDesc =
4558 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4559 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4560 .addReg(SrcReg)
4561 .addReg(Idx)
4562 .addImm(SubReg);
4563 } else {
4565
4566 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4567 .addReg(SrcReg, 0, SubReg)
4568 .addReg(SrcReg, RegState::Implicit);
4569 }
4570
4571 MI.eraseFromParent();
4572
4573 return &MBB;
4574 }
4575
4576 // Control flow needs to be inserted if indexing with a VGPR.
4577 const DebugLoc &DL = MI.getDebugLoc();
4579
4580 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4581 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4582
4583 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4584
4585 Register SGPRIdxReg;
4586 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4587 UseGPRIdxMode, SGPRIdxReg);
4588
4589 MachineBasicBlock *LoopBB = InsPt->getParent();
4590
4591 if (UseGPRIdxMode) {
4592 const MCInstrDesc &GPRIDXDesc =
4593 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4594
4595 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4596 .addReg(SrcReg)
4597 .addReg(SGPRIdxReg)
4598 .addImm(SubReg);
4599 } else {
4600 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4601 .addReg(SrcReg, 0, SubReg)
4602 .addReg(SrcReg, RegState::Implicit);
4603 }
4604
4605 MI.eraseFromParent();
4606
4607 return LoopBB;
4608}
4609
4612 const GCNSubtarget &ST) {
4613 const SIInstrInfo *TII = ST.getInstrInfo();
4614 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4617
4618 Register Dst = MI.getOperand(0).getReg();
4619 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4620 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4621 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4622 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4623 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4624 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4625
4626 // This can be an immediate, but will be folded later.
4627 assert(Val->getReg());
4628
4629 unsigned SubReg;
4630 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4631 SrcVec->getReg(),
4632 Offset);
4633 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4634
4635 if (Idx->getReg() == AMDGPU::NoRegister) {
4637 const DebugLoc &DL = MI.getDebugLoc();
4638
4639 assert(Offset == 0);
4640
4641 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4642 .add(*SrcVec)
4643 .add(*Val)
4644 .addImm(SubReg);
4645
4646 MI.eraseFromParent();
4647 return &MBB;
4648 }
4649
4650 // Check for a SGPR index.
4651 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4653 const DebugLoc &DL = MI.getDebugLoc();
4654
4655 if (UseGPRIdxMode) {
4657
4658 const MCInstrDesc &GPRIDXDesc =
4659 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4660 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4661 .addReg(SrcVec->getReg())
4662 .add(*Val)
4663 .addReg(Idx)
4664 .addImm(SubReg);
4665 } else {
4667
4668 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4669 TRI.getRegSizeInBits(*VecRC), 32, false);
4670 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4671 .addReg(SrcVec->getReg())
4672 .add(*Val)
4673 .addImm(SubReg);
4674 }
4675 MI.eraseFromParent();
4676 return &MBB;
4677 }
4678
4679 // Control flow needs to be inserted if indexing with a VGPR.
4680 if (Val->isReg())
4681 MRI.clearKillFlags(Val->getReg());
4682
4683 const DebugLoc &DL = MI.getDebugLoc();
4684
4685 Register PhiReg = MRI.createVirtualRegister(VecRC);
4686
4687 Register SGPRIdxReg;
4688 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4689 UseGPRIdxMode, SGPRIdxReg);
4690 MachineBasicBlock *LoopBB = InsPt->getParent();
4691
4692 if (UseGPRIdxMode) {
4693 const MCInstrDesc &GPRIDXDesc =
4694 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4695
4696 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4697 .addReg(PhiReg)
4698 .add(*Val)
4699 .addReg(SGPRIdxReg)
4700 .addImm(AMDGPU::sub0);
4701 } else {
4702 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4703 TRI.getRegSizeInBits(*VecRC), 32, false);
4704 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4705 .addReg(PhiReg)
4706 .add(*Val)
4707 .addImm(AMDGPU::sub0);
4708 }
4709
4710 MI.eraseFromParent();
4711 return LoopBB;
4712}
4713
4716 const GCNSubtarget &ST,
4717 unsigned Opc) {
4719 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4720 const DebugLoc &DL = MI.getDebugLoc();
4721 const SIInstrInfo *TII = ST.getInstrInfo();
4722
4723 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4724 Register SrcReg = MI.getOperand(1).getReg();
4725 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4726 Register DstReg = MI.getOperand(0).getReg();
4727 MachineBasicBlock *RetBB = nullptr;
4728 if (isSGPR) {
4729 // These operations are idempotent for a uniform value, i.e. an SGPR input.
4730 // The reduced value will be the same as the given SGPR.
4731 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4732 RetBB = &BB;
4733 } else {
4734 // TODO: Implement the DPP strategy and switch based on the immediate
4735 // strategy operand. For now, for all the cases (default, Iterative and DPP)
4736 // we use the iterative approach by default.
4737
4738 // To reduce the VGPR with the iterative approach, we need to iterate
4739 // over all the active lanes. The lowering consists of a ComputeLoop, which
4740 // iterates over only the active lanes. We use a copy of the EXEC register
4741 // as the induction variable, and each iteration clears the bit of the lane
4742 // just processed with bitset0, yielding the next active lane to visit.
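// A pseudocode sketch of the ComputeLoop built below (for the unsigned
// min/max opcodes this pseudo is currently lowered with):
//
//   Accum      = identity            // UINT32_MAX for umin, 0 for umax
//   ActiveBits = EXEC
//   do {
//     Lane       = s_ff1(ActiveBits)           // lowest remaining lane
//     LaneValue  = v_readlane(Src, Lane)
//     Accum      = op(Accum, LaneValue)
//     ActiveBits = bitset0(ActiveBits, Lane)   // retire that lane
//   } while (ActiveBits != 0);
//   Dst = Accum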
4744 Register SrcReg = MI.getOperand(1).getReg();
4745
4746 // Create Control flow for loop
4747 // Split MI's Machine Basic block into For loop
4748 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4749
4750 // Create virtual registers required for lowering.
4751 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4752 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4753 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4754 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4755
4756 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4757 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4758 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4759
4760 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4761 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4762
4763 bool IsWave32 = ST.isWave32();
4764 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4765 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4766
4767 // Create initial values of the induction variable from EXEC and the
4768 // accumulator, and insert a branch to the newly created ComputeLoop block.
4769 uint32_t InitalValue =
4770 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4771 auto TmpSReg =
4772 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4773 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4774 .addImm(InitalValue);
4775 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4776
4777 // Start constructing ComputeLoop
4778 I = ComputeLoop->end();
4779 auto Accumulator =
4780 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4781 .addReg(InitalValReg)
4782 .addMBB(&BB);
4783 auto ActiveBits =
4784 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4785 .addReg(TmpSReg->getOperand(0).getReg())
4786 .addMBB(&BB);
4787
4788 // Perform the computations
4789 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4790 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4791 .addReg(ActiveBits->getOperand(0).getReg());
4792 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4793 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4794 .addReg(SrcReg)
4795 .addReg(FF1->getOperand(0).getReg());
4796 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4797 .addReg(Accumulator->getOperand(0).getReg())
4798 .addReg(LaneValue->getOperand(0).getReg());
4799
4800 // Manipulate the iterator to get the next active lane
4801 unsigned BITSETOpc =
4802 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4803 auto NewActiveBits =
4804 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4805 .addReg(FF1->getOperand(0).getReg())
4806 .addReg(ActiveBits->getOperand(0).getReg());
4807
4808 // Add phi nodes
4809 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4810 .addMBB(ComputeLoop);
4811 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4812 .addMBB(ComputeLoop);
4813
4814 // Create the loop-back branch.
4815 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4816 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4817 .addReg(NewActiveBits->getOperand(0).getReg())
4818 .addImm(0);
4819 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4820 .addMBB(ComputeLoop);
4821
4822 RetBB = ComputeEnd;
4823 }
4824 MI.eraseFromParent();
4825 return RetBB;
4826}
4827
4829 MachineInstr &MI, MachineBasicBlock *BB) const {
4830
4832 MachineFunction *MF = BB->getParent();
4834
4835 switch (MI.getOpcode()) {
4836 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4837 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4838 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4839 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4840 case AMDGPU::S_UADDO_PSEUDO:
4841 case AMDGPU::S_USUBO_PSEUDO: {
4842 const DebugLoc &DL = MI.getDebugLoc();
4843 MachineOperand &Dest0 = MI.getOperand(0);
4844 MachineOperand &Dest1 = MI.getOperand(1);
4845 MachineOperand &Src0 = MI.getOperand(2);
4846 MachineOperand &Src1 = MI.getOperand(3);
4847
4848 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4849 ? AMDGPU::S_ADD_I32
4850 : AMDGPU::S_SUB_I32;
4851 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4852
4853 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4854 .addImm(1)
4855 .addImm(0);
4856
4857 MI.eraseFromParent();
4858 return BB;
4859 }
4860 case AMDGPU::S_ADD_U64_PSEUDO:
4861 case AMDGPU::S_SUB_U64_PSEUDO: {
4862 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4863 // For GFX12, we emit s_add_u64 and s_sub_u64.
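// Sketch of the pre-GFX12 expansion emitted below for S_ADD_U64_PSEUDO (the
// subtract form is analogous with s_sub_u32/s_subb_u32):
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1   // consumes the SCC carry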
4864 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4866 const DebugLoc &DL = MI.getDebugLoc();
4867 MachineOperand &Dest = MI.getOperand(0);
4868 MachineOperand &Src0 = MI.getOperand(1);
4869 MachineOperand &Src1 = MI.getOperand(2);
4870 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4871 if (Subtarget->hasScalarAddSub64()) {
4872 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4873 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4874 .add(Src0)
4875 .add(Src1);
4876 } else {
4877 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4878 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4879
4880 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4881 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4882
4883 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4884 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4885 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4886 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4887
4888 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4889 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4890 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4891 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4892
4893 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4894 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4895 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4896 .add(Src0Sub0)
4897 .add(Src1Sub0);
4898 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4899 .add(Src0Sub1)
4900 .add(Src1Sub1);
4901 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4902 .addReg(DestSub0)
4903 .addImm(AMDGPU::sub0)
4904 .addReg(DestSub1)
4905 .addImm(AMDGPU::sub1);
4906 }
4907 MI.eraseFromParent();
4908 return BB;
4909 }
4910 case AMDGPU::V_ADD_U64_PSEUDO:
4911 case AMDGPU::V_SUB_U64_PSEUDO: {
4913 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4914 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4915 const DebugLoc &DL = MI.getDebugLoc();
4916
4917 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4918
4919 MachineOperand &Dest = MI.getOperand(0);
4920 MachineOperand &Src0 = MI.getOperand(1);
4921 MachineOperand &Src1 = MI.getOperand(2);
4922
4923 if (IsAdd && ST.hasLshlAddB64()) {
4924 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4925 Dest.getReg())
4926 .add(Src0)
4927 .addImm(0)
4928 .add(Src1);
4929 TII->legalizeOperands(*Add);
4930 MI.eraseFromParent();
4931 return BB;
4932 }
4933
4934 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4935
4936 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4937 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4938
4939 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4940 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4941
4942 const TargetRegisterClass *Src0RC = Src0.isReg()
4943 ? MRI.getRegClass(Src0.getReg())
4944 : &AMDGPU::VReg_64RegClass;
4945 const TargetRegisterClass *Src1RC = Src1.isReg()
4946 ? MRI.getRegClass(Src1.getReg())
4947 : &AMDGPU::VReg_64RegClass;
4948
4949 const TargetRegisterClass *Src0SubRC =
4950 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4951 const TargetRegisterClass *Src1SubRC =
4952 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4953
4954 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4955 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4956 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4957 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4958
4959 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4960 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4961 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4962 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4963
4964 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4965 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4966 .addReg(CarryReg, RegState::Define)
4967 .add(SrcReg0Sub0)
4968 .add(SrcReg1Sub0)
4969 .addImm(0); // clamp bit
4970
4971 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4972 MachineInstr *HiHalf =
4973 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4974 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4975 .add(SrcReg0Sub1)
4976 .add(SrcReg1Sub1)
4977 .addReg(CarryReg, RegState::Kill)
4978 .addImm(0); // clamp bit
4979
4980 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4981 .addReg(DestSub0)
4982 .addImm(AMDGPU::sub0)
4983 .addReg(DestSub1)
4984 .addImm(AMDGPU::sub1);
4985 TII->legalizeOperands(*LoHalf);
4986 TII->legalizeOperands(*HiHalf);
4987 MI.eraseFromParent();
4988 return BB;
4989 }
4990 case AMDGPU::S_ADD_CO_PSEUDO:
4991 case AMDGPU::S_SUB_CO_PSEUDO: {
4992 // This pseudo can only be selected from a uniform
4993 // add/subcarry node. All of its VGPR operands are therefore
4994 // assumed to be splat vectors.
4996 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4997 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4999 const DebugLoc &DL = MI.getDebugLoc();
5000 MachineOperand &Dest = MI.getOperand(0);
5001 MachineOperand &CarryDest = MI.getOperand(1);
5002 MachineOperand &Src0 = MI.getOperand(2);
5003 MachineOperand &Src1 = MI.getOperand(3);
5004 MachineOperand &Src2 = MI.getOperand(4);
5005 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5006 ? AMDGPU::S_ADDC_U32
5007 : AMDGPU::S_SUBB_U32;
5008 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5009 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5010 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5011 .addReg(Src0.getReg());
5012 Src0.setReg(RegOp0);
5013 }
5014 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5015 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5016 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5017 .addReg(Src1.getReg());
5018 Src1.setReg(RegOp1);
5019 }
5020 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5021 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5022 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5023 .addReg(Src2.getReg());
5024 Src2.setReg(RegOp2);
5025 }
5026
5027 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5028 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5029 assert(WaveSize == 64 || WaveSize == 32);
5030
5031 if (WaveSize == 64) {
5032 if (ST.hasScalarCompareEq64()) {
5033 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5034 .addReg(Src2.getReg())
5035 .addImm(0);
5036 } else {
5037 const TargetRegisterClass *SubRC =
5038 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5039 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5040 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5041 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5042 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5043 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5044
5045 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5046 .add(Src2Sub0)
5047 .add(Src2Sub1);
5048
5049 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5050 .addReg(Src2_32, RegState::Kill)
5051 .addImm(0);
5052 }
5053 } else {
5054 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5055 .addReg(Src2.getReg())
5056 .addImm(0);
5057 }
5058
5059 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5060
5061 unsigned SelOpc =
5062 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5063
5064 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5065 .addImm(-1)
5066 .addImm(0);
5067
5068 MI.eraseFromParent();
5069 return BB;
5070 }
5071 case AMDGPU::SI_INIT_M0: {
5072 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5073 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5074 .add(MI.getOperand(0));
5075 MI.eraseFromParent();
5076 return BB;
5077 }
5078 case AMDGPU::GET_GROUPSTATICSIZE: {
5079 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5080 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5081 DebugLoc DL = MI.getDebugLoc();
5082 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5083 .add(MI.getOperand(0))
5084 .addImm(MFI->getLDSSize());
5085 MI.eraseFromParent();
5086 return BB;
5087 }
5088 case AMDGPU::GET_SHADERCYCLESHILO: {
5091 const DebugLoc &DL = MI.getDebugLoc();
5092 // The algorithm is:
5093 //
5094 // hi1 = getreg(SHADER_CYCLES_HI)
5095 // lo1 = getreg(SHADER_CYCLES_LO)
5096 // hi2 = getreg(SHADER_CYCLES_HI)
5097 //
5098 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5099 // Otherwise there was overflow and the result is hi2:0. In both cases the
5100 // result should represent the actual time at some point during the sequence
5101 // of three getregs.
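// For example, with hypothetical register reads: lo1 = 0xFFFFFFF0 and
// hi1 == hi2 == 0x12 gives 0x12FFFFFFF0, while a wrap of the low half between
// the reads (hi2 == 0x13) gives 0x1300000000; either way the result lies
// within the window spanned by the three getregs.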
5102 using namespace AMDGPU::Hwreg;
5103 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5104 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5105 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5106 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5107 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5108 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5109 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5110 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5111 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5112 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5113 .addReg(RegHi1)
5114 .addReg(RegHi2);
5115 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5116 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5117 .addReg(RegLo1)
5118 .addImm(0);
5119 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5120 .add(MI.getOperand(0))
5121 .addReg(RegLo)
5122 .addImm(AMDGPU::sub0)
5123 .addReg(RegHi2)
5124 .addImm(AMDGPU::sub1);
5125 MI.eraseFromParent();
5126 return BB;
5127 }
5128 case AMDGPU::SI_INDIRECT_SRC_V1:
5129 case AMDGPU::SI_INDIRECT_SRC_V2:
5130 case AMDGPU::SI_INDIRECT_SRC_V4:
5131 case AMDGPU::SI_INDIRECT_SRC_V8:
5132 case AMDGPU::SI_INDIRECT_SRC_V9:
5133 case AMDGPU::SI_INDIRECT_SRC_V10:
5134 case AMDGPU::SI_INDIRECT_SRC_V11:
5135 case AMDGPU::SI_INDIRECT_SRC_V12:
5136 case AMDGPU::SI_INDIRECT_SRC_V16:
5137 case AMDGPU::SI_INDIRECT_SRC_V32:
5138 return emitIndirectSrc(MI, *BB, *getSubtarget());
5139 case AMDGPU::SI_INDIRECT_DST_V1:
5140 case AMDGPU::SI_INDIRECT_DST_V2:
5141 case AMDGPU::SI_INDIRECT_DST_V4:
5142 case AMDGPU::SI_INDIRECT_DST_V8:
5143 case AMDGPU::SI_INDIRECT_DST_V9:
5144 case AMDGPU::SI_INDIRECT_DST_V10:
5145 case AMDGPU::SI_INDIRECT_DST_V11:
5146 case AMDGPU::SI_INDIRECT_DST_V12:
5147 case AMDGPU::SI_INDIRECT_DST_V16:
5148 case AMDGPU::SI_INDIRECT_DST_V32:
5149 return emitIndirectDst(MI, *BB, *getSubtarget());
5150 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5151 case AMDGPU::SI_KILL_I1_PSEUDO:
5152 return splitKillBlock(MI, BB);
5153 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5155 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5156 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5157
5158 Register Dst = MI.getOperand(0).getReg();
5159 const MachineOperand &Src0 = MI.getOperand(1);
5160 const MachineOperand &Src1 = MI.getOperand(2);
5161 const DebugLoc &DL = MI.getDebugLoc();
5162 Register SrcCond = MI.getOperand(3).getReg();
5163
5164 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5165 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5166 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5167 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5168
5169 const TargetRegisterClass *Src0RC = Src0.isReg()
5170 ? MRI.getRegClass(Src0.getReg())
5171 : &AMDGPU::VReg_64RegClass;
5172 const TargetRegisterClass *Src1RC = Src1.isReg()
5173 ? MRI.getRegClass(Src1.getReg())
5174 : &AMDGPU::VReg_64RegClass;
5175
5176 const TargetRegisterClass *Src0SubRC =
5177 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5178 const TargetRegisterClass *Src1SubRC =
5179 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5180
5181 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5182 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5183 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5184 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5185
5186 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5187 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5188 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5189 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5190
5191 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5192 .addReg(SrcCond);
5193 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5194 .addImm(0)
5195 .add(Src0Sub0)
5196 .addImm(0)
5197 .add(Src1Sub0)
5198 .addReg(SrcCondCopy);
5199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5200 .addImm(0)
5201 .add(Src0Sub1)
5202 .addImm(0)
5203 .add(Src1Sub1)
5204 .addReg(SrcCondCopy);
5205
5206 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5207 .addReg(DstLo)
5208 .addImm(AMDGPU::sub0)
5209 .addReg(DstHi)
5210 .addImm(AMDGPU::sub1);
5211 MI.eraseFromParent();
5212 return BB;
5213 }
5214 case AMDGPU::SI_BR_UNDEF: {
5216 const DebugLoc &DL = MI.getDebugLoc();
5217 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5218 .add(MI.getOperand(0));
5219 Br->getOperand(1).setIsUndef(); // read undef SCC
5220 MI.eraseFromParent();
5221 return BB;
5222 }
5223 case AMDGPU::ADJCALLSTACKUP:
5224 case AMDGPU::ADJCALLSTACKDOWN: {
5226 MachineInstrBuilder MIB(*MF, &MI);
5227 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5228 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5229 return BB;
5230 }
5231 case AMDGPU::SI_CALL_ISEL: {
5233 const DebugLoc &DL = MI.getDebugLoc();
5234
5235 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5236
5238 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5239
5240 for (const MachineOperand &MO : MI.operands())
5241 MIB.add(MO);
5242
5243 MIB.cloneMemRefs(MI);
5244 MI.eraseFromParent();
5245 return BB;
5246 }
5247 case AMDGPU::V_ADD_CO_U32_e32:
5248 case AMDGPU::V_SUB_CO_U32_e32:
5249 case AMDGPU::V_SUBREV_CO_U32_e32: {
5250 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5251 const DebugLoc &DL = MI.getDebugLoc();
5252 unsigned Opc = MI.getOpcode();
5253
5254 bool NeedClampOperand = false;
5255 if (TII->pseudoToMCOpcode(Opc) == -1) {
5256 Opc = AMDGPU::getVOPe64(Opc);
5257 NeedClampOperand = true;
5258 }
5259
5260 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5261 if (TII->isVOP3(*I)) {
5262 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5263 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5264 I.addReg(TRI->getVCC(), RegState::Define);
5265 }
5266 I.add(MI.getOperand(1))
5267 .add(MI.getOperand(2));
5268 if (NeedClampOperand)
5269 I.addImm(0); // clamp bit for e64 encoding
5270
5271 TII->legalizeOperands(*I);
5272
5273 MI.eraseFromParent();
5274 return BB;
5275 }
5276 case AMDGPU::V_ADDC_U32_e32:
5277 case AMDGPU::V_SUBB_U32_e32:
5278 case AMDGPU::V_SUBBREV_U32_e32:
5279 // These instructions have an implicit use of vcc which counts towards the
5280 // constant bus limit.
5281 TII->legalizeOperands(MI);
5282 return BB;
5283 case AMDGPU::DS_GWS_INIT:
5284 case AMDGPU::DS_GWS_SEMA_BR:
5285 case AMDGPU::DS_GWS_BARRIER:
5286 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5287 [[fallthrough]];
5288 case AMDGPU::DS_GWS_SEMA_V:
5289 case AMDGPU::DS_GWS_SEMA_P:
5290 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5291 // An s_waitcnt 0 is required to be the instruction immediately following.
5292 if (getSubtarget()->hasGWSAutoReplay()) {
5294 return BB;
5295 }
5296
5297 return emitGWSMemViolTestLoop(MI, BB);
5298 case AMDGPU::S_SETREG_B32: {
5299 // Try to optimize cases that only set the denormal mode or rounding mode.
5300 //
5301 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5302 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5303 // instead.
5304 //
5305 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5306 // allow a no-side-effect instruction in the output of a side-effecting
5307 // pattern.
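// Illustrative sketch (annotation, not upstream source): if the value being
// written comes from a move of the immediate 0x3 and the hwreg operand
// selects exactly the 4 rounding-mode bits of the MODE register, the code
// below can emit a single
//   s_round_mode 0x3
// instead of a read-modify-write s_setreg_b32.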
5308 auto [ID, Offset, Width] =
5309 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5311 return BB;
5312
5313 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5314 const unsigned SetMask = WidthMask << Offset;
5315
5316 if (getSubtarget()->hasDenormModeInst()) {
5317 unsigned SetDenormOp = 0;
5318 unsigned SetRoundOp = 0;
5319
5320 // The dedicated instructions can only set the whole denorm or round mode
5321 // at once, not a subset of bits in either.
5322 if (SetMask == (AMDGPU::Hwreg::FP_ROUND_MASK |
5323 AMDGPU::Hwreg::FP_DENORM_MASK)) {
5324 // If this fully sets both the round and denorm mode, emit the two
5325 // dedicated instructions for these.
5326 SetRoundOp = AMDGPU::S_ROUND_MODE;
5327 SetDenormOp = AMDGPU::S_DENORM_MODE;
5328 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5329 SetRoundOp = AMDGPU::S_ROUND_MODE;
5330 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5331 SetDenormOp = AMDGPU::S_DENORM_MODE;
5332 }
5333
5334 if (SetRoundOp || SetDenormOp) {
5336 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5337 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5338 unsigned ImmVal = Def->getOperand(1).getImm();
5339 if (SetRoundOp) {
5340 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5341 .addImm(ImmVal & 0xf);
5342
5343 // If we also have the denorm mode, get just the denorm mode bits.
5344 ImmVal >>= 4;
5345 }
5346
5347 if (SetDenormOp) {
5348 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5349 .addImm(ImmVal & 0xf);
5350 }
5351
5352 MI.eraseFromParent();
5353 return BB;
5354 }
5355 }
5356 }
5357
5358 // If only FP bits are touched, use the no side effects pseudo.
5359 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5360 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5361 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5362
5363 return BB;
5364 }
5365 case AMDGPU::S_INVERSE_BALLOT_U32:
5366 case AMDGPU::S_INVERSE_BALLOT_U64: {
5368 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5369 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5370 const DebugLoc &DL = MI.getDebugLoc();
5371 const Register DstReg = MI.getOperand(0).getReg();
5372 Register MaskReg = MI.getOperand(1).getReg();
5373
5374 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
5375
5376 if (IsVALU) {
5377 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
5378 }
5379
5380 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5381 MI.eraseFromParent();
5382 return BB;
5383 }
5384 case AMDGPU::ENDPGM_TRAP: {
5385 const DebugLoc &DL = MI.getDebugLoc();
5386 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5387 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5388 MI.addOperand(MachineOperand::CreateImm(0));
5389 return BB;
5390 }
5391
5392 // We need a block split to make the real endpgm a terminator. We also don't
5393 // want to break phis in successor blocks, so we can't just delete to the
5394 // end of the block.
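// Annotation (summary of the lowering below, not upstream text): the block is
// split at this instruction, a new trap block containing only s_endpgm is
// appended to the function, and an s_cbranch_execnz to that block is inserted
// here, so the endpgm becomes a proper terminator without disturbing PHIs in
// the existing successors.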
5395
5396 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5398 MF->push_back(TrapBB);
5399 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5400 .addImm(0);
5401 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5402 .addMBB(TrapBB);
5403
5404 BB->addSuccessor(TrapBB);
5405 MI.eraseFromParent();
5406 return SplitBB;
5407 }
5408 case AMDGPU::SIMULATED_TRAP: {
5409 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5411 MachineBasicBlock *SplitBB =
5412 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5413 MI.eraseFromParent();
5414 return SplitBB;
5415 }
5416 default:
5417 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5418 if (!MI.mayStore())
5420 return BB;
5421 }
5423 }
5424}
5425
5427 // This currently forces unfolding various combinations of fsub into fma with
5428 // free fneg'd operands. As long as we have fast FMA (controlled by
5429 // isFMAFasterThanFMulAndFAdd), we should perform these.
5430
5431 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5432 // most of these combines appear to be cycle neutral but save on instruction
5433 // count / code size.
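// E.g. (annotation): with this hook returning true, the combiner may unfold
// (fsub (fmul a, b), c) into (fma a, b, (fneg c)), relying on the fneg being
// free as a source modifier.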
5434 return true;
5435}
5436
5438
5440 EVT VT) const {
5441 if (!VT.isVector()) {
5442 return MVT::i1;
5443 }
5444 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5445}
5446
5448 // TODO: Should i16 be used always if legal? For now it would force VALU
5449 // shifts.
5450 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5451}
5452
5454 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5455 ? Ty.changeElementSize(16)
5456 : Ty.changeElementSize(32);
5457}
5458
5459 // Answering this is somewhat tricky and depends on the specific device,
5460 // since different devices have different rates for fma and for f64 operations.
5461//
5462// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5463// regardless of which device (although the number of cycles differs between
5464// devices), so it is always profitable for f64.
5465//
5466// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5467// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5468// which we can always do even without fused FP ops since it returns the same
5469// result as the separate operations and since it is always full
5470// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5471// however does not support denormals, so we do report fma as faster if we have
5472// a fast fma device and require denormals.
5473//
5475 EVT VT) const {
5476 VT = VT.getScalarType();
5477
5478 switch (VT.getSimpleVT().SimpleTy) {
5479 case MVT::f32: {
5480 // If mad is not available this depends only on if f32 fma is full rate.
5481 if (!Subtarget->hasMadMacF32Insts())
5482 return Subtarget->hasFastFMAF32();
5483
5484 // Otherwise f32 mad is always full rate and returns the same result as
5485 // the separate operations so should be preferred over fma.
5486 // However, it does not support denormals.
5488 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5489
5490 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5491 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5492 }
5493 case MVT::f64:
5494 return true;
5495 case MVT::f16:
5496 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5497 default:
5498 break;
5499 }
5500
5501 return false;
5502}
5503
5505 LLT Ty) const {
5506 switch (Ty.getScalarSizeInBits()) {
5507 case 16:
5508 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5509 case 32:
5510 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5511 case 64:
5512 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5513 default:
5514 break;
5515 }
5516
5517 return false;
5518}
5519
5521 if (!Ty.isScalar())
5522 return false;
5523
5524 if (Ty.getScalarSizeInBits() == 16)
5525 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5526 if (Ty.getScalarSizeInBits() == 32)
5527 return Subtarget->hasMadMacF32Insts() &&
5528 denormalModeIsFlushAllF32(*MI.getMF());
5529
5530 return false;
5531}
5532
5534 const SDNode *N) const {
5535 // TODO: Check future ftz flag
5536 // v_mad_f32/v_mac_f32 do not support denormals.
5537 EVT VT = N->getValueType(0);
5538 if (VT == MVT::f32)
5539 return Subtarget->hasMadMacF32Insts() &&
5541 if (VT == MVT::f16) {
5542 return Subtarget->hasMadF16() &&
5544 }
5545
5546 return false;
5547}
5548
5549//===----------------------------------------------------------------------===//
5550// Custom DAG Lowering Operations
5551//===----------------------------------------------------------------------===//
5552
5553// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5554// wider vector type is legal.
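// For example (annotation): an fneg of v4f16 is split here into two fneg
// nodes on the v2f16 halves and rejoined with CONCAT_VECTORS, instead of
// letting LegalizeDAG scalarize it into four f16 operations.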
5556 SelectionDAG &DAG) const {
5557 unsigned Opc = Op.getOpcode();
5558 EVT VT = Op.getValueType();
5559 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5560 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5561 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5562 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5563
5564 SDValue Lo, Hi;
5565 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5566
5567 SDLoc SL(Op);
5568 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5569 Op->getFlags());
5570 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5571 Op->getFlags());
5572
5573 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5574}
5575
5576// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5577// wider vector type is legal.
5579 SelectionDAG &DAG) const {
5580 unsigned Opc = Op.getOpcode();
5581 EVT VT = Op.getValueType();
5582 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5583 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5584 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5585 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5586
5587 SDValue Lo0, Hi0;
5588 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5589 SDValue Lo1, Hi1;
5590 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5591
5592 SDLoc SL(Op);
5593
5594 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5595 Op->getFlags());
5596 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5597 Op->getFlags());
5598
5599 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5600}
5601
5603 SelectionDAG &DAG) const {
5604 unsigned Opc = Op.getOpcode();
5605 EVT VT = Op.getValueType();
5606 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5607 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5608 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5609 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5610 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5611 VT == MVT::v32bf16);
5612
5613 SDValue Lo0, Hi0;
5614 SDValue Op0 = Op.getOperand(0);
5615 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5616 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5617 : std::pair(Op0, Op0);
5618 SDValue Lo1, Hi1;
5619 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5620 SDValue Lo2, Hi2;
5621 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5622
5623 SDLoc SL(Op);
5624 auto ResVT = DAG.GetSplitDestVTs(VT);
5625
5626 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5627 Op->getFlags());
5628 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5629 Op->getFlags());
5630
5631 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5632}
5633
5634
5635 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5636 switch (Op.getOpcode()) {
5637 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5638 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5639 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5640 case ISD::LOAD: {
5641 SDValue Result = LowerLOAD(Op, DAG);
5642 assert((!Result.getNode() ||
5643 Result.getNode()->getNumValues() == 2) &&
5644 "Load should return a value and a chain");
5645 return Result;
5646 }
5647 case ISD::FSQRT: {
5648 EVT VT = Op.getValueType();
5649 if (VT == MVT::f32)
5650 return lowerFSQRTF32(Op, DAG);
5651 if (VT == MVT::f64)
5652 return lowerFSQRTF64(Op, DAG);
5653 return SDValue();
5654 }
5655 case ISD::FSIN:
5656 case ISD::FCOS:
5657 return LowerTrig(Op, DAG);
5658 case ISD::SELECT: return LowerSELECT(Op, DAG);
5659 case ISD::FDIV: return LowerFDIV(Op, DAG);
5660 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5661 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5662 case ISD::STORE: return LowerSTORE(Op, DAG);
5663 case ISD::GlobalAddress: {
5666 return LowerGlobalAddress(MFI, Op, DAG);
5667 }
5668 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5669 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5670 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5671 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5673 return lowerINSERT_SUBVECTOR(Op, DAG);
5675 return lowerINSERT_VECTOR_ELT(Op, DAG);
5677 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5679 return lowerVECTOR_SHUFFLE(Op, DAG);
5681 return lowerSCALAR_TO_VECTOR(Op, DAG);
5682 case ISD::BUILD_VECTOR:
5683 return lowerBUILD_VECTOR(Op, DAG);
5684 case ISD::FP_ROUND:
5686 return lowerFP_ROUND(Op, DAG);
5687 case ISD::FPTRUNC_ROUND: {
5688 unsigned Opc;
5689 SDLoc DL(Op);
5690
5691 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5692 return SDValue();
5693
5694 // Get the rounding mode from the last operand
5695 int RoundMode = Op.getConstantOperandVal(1);
5696 if (RoundMode == (int)RoundingMode::TowardPositive)
5698 else if (RoundMode == (int)RoundingMode::TowardNegative)
5700 else
5701 return SDValue();
5702
5703 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5704 }
5705 case ISD::TRAP:
5706 return lowerTRAP(Op, DAG);
5707 case ISD::DEBUGTRAP:
5708 return lowerDEBUGTRAP(Op, DAG);
5709 case ISD::FABS:
5710 case ISD::FNEG:
5711 case ISD::FCANONICALIZE:
5712 case ISD::BSWAP:
5713 return splitUnaryVectorOp(Op, DAG);
5714 case ISD::FMINNUM:
5715 case ISD::FMAXNUM:
5716 return lowerFMINNUM_FMAXNUM(Op, DAG);
5717 case ISD::FLDEXP:
5718 case ISD::STRICT_FLDEXP:
5719 return lowerFLDEXP(Op, DAG);
5720 case ISD::FMA:
5721 return splitTernaryVectorOp(Op, DAG);
5722 case ISD::FP_TO_SINT:
5723 case ISD::FP_TO_UINT:
5724 return LowerFP_TO_INT(Op, DAG);
5725 case ISD::SHL:
5726 case ISD::SRA:
5727 case ISD::SRL:
5728 case ISD::ADD:
5729 case ISD::SUB:
5730 case ISD::SMIN:
5731 case ISD::SMAX:
5732 case ISD::UMIN:
5733 case ISD::UMAX:
5734 case ISD::FADD:
5735 case ISD::FMUL:
5736 case ISD::FMINNUM_IEEE:
5737 case ISD::FMAXNUM_IEEE:
5738 case ISD::UADDSAT:
5739 case ISD::USUBSAT:
5740 case ISD::SADDSAT:
5741 case ISD::SSUBSAT:
5742 return splitBinaryVectorOp(Op, DAG);
5743 case ISD::MUL:
5744 return lowerMUL(Op, DAG);
5745 case ISD::SMULO:
5746 case ISD::UMULO:
5747 return lowerXMULO(Op, DAG);
5748 case ISD::SMUL_LOHI:
5749 case ISD::UMUL_LOHI:
5750 return lowerXMUL_LOHI(Op, DAG);
5752 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5753 case ISD::STACKSAVE:
5754 return LowerSTACKSAVE(Op, DAG);
5755 case ISD::GET_ROUNDING:
5756 return lowerGET_ROUNDING(Op, DAG);
5757 case ISD::PREFETCH:
5758 return lowerPREFETCH(Op, DAG);
5759 case ISD::FP_EXTEND:
5761 return lowerFP_EXTEND(Op, DAG);
5762 case ISD::GET_FPENV:
5763 return lowerGET_FPENV(Op, DAG);
5764 case ISD::SET_FPENV:
5765 return lowerSET_FPENV(Op, DAG);
5766 }
5767 return SDValue();
5768}
5769
5770 // Used for D16: casts the result of an instruction into the right vector
5771 // type and packs the values if loads return unpacked values.
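// Sketch (annotation): with unpacked D16 memory instructions, a d16 load of
// v3f16 arrives here as v3i32; the code below truncates each element to i16,
// pads the odd-sized vector to v4i16, and bitcasts to the widened v4f16
// result expected by legalization.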
5773 const SDLoc &DL,
5774 SelectionDAG &DAG, bool Unpacked) {
5775 if (!LoadVT.isVector())
5776 return Result;
5777
5778 // Cast back to the original packed type or to a larger type that is a
5779 // multiple of 32 bits for D16. Widening the return type is required for
5780 // legalization.
5781 EVT FittingLoadVT = LoadVT;
5782 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5783 FittingLoadVT =
5785 LoadVT.getVectorNumElements() + 1);
5786 }
5787
5788 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5789 // Truncate to v2i16/v4i16.
5790 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5791
5792 // Work around the legalizer neither scalarizing the truncate after vector
5793 // op legalization nor creating an intermediate vector trunc.
5795 DAG.ExtractVectorElements(Result, Elts);
5796 for (SDValue &Elt : Elts)
5797 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5798
5799 // Pad illegal v1i16/v3f16 to v4i16
5800 if ((LoadVT.getVectorNumElements() % 2) == 1)
5801 Elts.push_back(DAG.getUNDEF(MVT::i16));
5802
5803 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5804
5805 // Bitcast to original type (v2f16/v4f16).
5806 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5807 }
5808
5809 // Cast back to the original packed type.
5810 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5811}
5812
5813SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5814 MemSDNode *M,
5815 SelectionDAG &DAG,
5817 bool IsIntrinsic) const {
5818 SDLoc DL(M);
5819
5820 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5821 EVT LoadVT = M->getValueType(0);
5822
5823 EVT EquivLoadVT = LoadVT;
5824 if (LoadVT.isVector()) {
5825 if (Unpacked) {
5826 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5827 LoadVT.getVectorNumElements());
5828 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5829 // Widen v3f16 to legal type
5830 EquivLoadVT =
5832 LoadVT.getVectorNumElements() + 1);
5833 }
5834 }
5835
5836 // Change from v4f16/v2f16 to EquivLoadVT.
5837 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5838
5840 = DAG.getMemIntrinsicNode(
5841 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5842 VTList, Ops, M->getMemoryVT(),
5843 M->getMemOperand());
5844
5845 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5846
5847 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5848}
5849
5850SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5851 SelectionDAG &DAG,
5852 ArrayRef<SDValue> Ops) const {
5853 SDLoc DL(M);
5854 EVT LoadVT = M->getValueType(0);
5855 EVT EltType = LoadVT.getScalarType();
5856 EVT IntVT = LoadVT.changeTypeToInteger();
5857
5858 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5859
5860 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5861 bool IsTFE = M->getNumValues() == 3;
5862
5863 unsigned Opc;
5864 if (IsFormat) {
5867 } else {
5868 // TODO: Support non-format TFE loads.
5869 if (IsTFE)
5870 return SDValue();
5872 }
5873
5874 if (IsD16) {
5875 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5876 }
5877
5878 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5879 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5880 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
5881
5882 if (isTypeLegal(LoadVT)) {
5883 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5884 M->getMemOperand(), DAG);
5885 }
5886
5887 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5888 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5889 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5890 M->getMemOperand(), DAG);
5891 return DAG.getMergeValues(
5892 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5893 DL);
5894}
5895
5897 SDNode *N, SelectionDAG &DAG) {
5898 EVT VT = N->getValueType(0);
5899 unsigned CondCode = N->getConstantOperandVal(3);
5900 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
5901 return DAG.getUNDEF(VT);
5902
5903 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5904
5905 SDValue LHS = N->getOperand(1);
5906 SDValue RHS = N->getOperand(2);
5907
5908 SDLoc DL(N);
5909
5910 EVT CmpVT = LHS.getValueType();
5911 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
5912 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
5913 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5914 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
5915 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
5916 }
5917
5918 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
5919
5920 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5921 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5922
5923 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
5924 DAG.getCondCode(CCOpcode));
5925 if (VT.bitsEq(CCVT))
5926 return SetCC;
5927 return DAG.getZExtOrTrunc(SetCC, DL, VT);
5928}
5929
5931 SDNode *N, SelectionDAG &DAG) {
5932 EVT VT = N->getValueType(0);
5933
5934 unsigned CondCode = N->getConstantOperandVal(3);
5935 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
5936 return DAG.getUNDEF(VT);
5937
5938 SDValue Src0 = N->getOperand(1);
5939 SDValue Src1 = N->getOperand(2);
5940 EVT CmpVT = Src0.getValueType();
5941 SDLoc SL(N);
5942
5943 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
5944 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
5945 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
5946 }
5947
5948 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
5949 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
5950 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5951 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5952 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
5953 Src1, DAG.getCondCode(CCOpcode));
5954 if (VT.bitsEq(CCVT))
5955 return SetCC;
5956 return DAG.getZExtOrTrunc(SetCC, SL, VT);
5957}
5958
5960 SelectionDAG &DAG) {
5961 EVT VT = N->getValueType(0);
5962 SDValue Src = N->getOperand(1);
5963 SDLoc SL(N);
5964
5965 if (Src.getOpcode() == ISD::SETCC) {
5966 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
5967 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
5968 Src.getOperand(1), Src.getOperand(2));
5969 }
5970 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
5971 // (ballot 0) -> 0
5972 if (Arg->isZero())
5973 return DAG.getConstant(0, SL, VT);
5974
5975 // (ballot 1) -> EXEC/EXEC_LO
5976 if (Arg->isOne()) {
5977 Register Exec;
5978 if (VT.getScalarSizeInBits() == 32)
5979 Exec = AMDGPU::EXEC_LO;
5980 else if (VT.getScalarSizeInBits() == 64)
5981 Exec = AMDGPU::EXEC;
5982 else
5983 return SDValue();
5984
5985 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
5986 }
5987 }
5988
5989 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
5990 // ISD::SETNE)
5991 return DAG.getNode(
5992 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
5993 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
5994}
5995
5998 SelectionDAG &DAG) const {
5999 switch (N->getOpcode()) {
6001 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6002 Results.push_back(Res);
6003 return;
6004 }
6006 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6007 Results.push_back(Res);
6008 return;
6009 }
6011 unsigned IID = N->getConstantOperandVal(0);
6012 switch (IID) {
6013 case Intrinsic::amdgcn_make_buffer_rsrc:
6014 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6015 return;
6016 case Intrinsic::amdgcn_cvt_pkrtz: {
6017 SDValue Src0 = N->getOperand(1);
6018 SDValue Src1 = N->getOperand(2);
6019 SDLoc SL(N);
6020 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6021 Src0, Src1);
6022 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6023 return;
6024 }
6025 case Intrinsic::amdgcn_cvt_pknorm_i16:
6026 case Intrinsic::amdgcn_cvt_pknorm_u16:
6027 case Intrinsic::amdgcn_cvt_pk_i16:
6028 case Intrinsic::amdgcn_cvt_pk_u16: {
6029 SDValue Src0 = N->getOperand(1);
6030 SDValue Src1 = N->getOperand(2);
6031 SDLoc SL(N);
6032 unsigned Opcode;
6033
6034 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6036 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6038 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6040 else
6042
6043 EVT VT = N->getValueType(0);
6044 if (isTypeLegal(VT))
6045 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6046 else {
6047 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6048 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6049 }
6050 return;
6051 }
6052 case Intrinsic::amdgcn_s_buffer_load: {
6053 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6054 // s_buffer_load_u8 for both signed and unsigned load instructions. Next,
6055 // the DAG combiner tries to merge the s_buffer_load_u8 with a sext
6056 // instruction (performSignExtendInRegCombine()) and replaces
6057 // s_buffer_load_u8 with s_buffer_load_i8.
6058 if (!Subtarget->hasScalarSubwordLoads())
6059 return;
6060 SDValue Op = SDValue(N, 0);
6061 SDValue Rsrc = Op.getOperand(1);
6062 SDValue Offset = Op.getOperand(2);
6063 SDValue CachePolicy = Op.getOperand(3);
6064 EVT VT = Op.getValueType();
6065 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6066 SDLoc DL(Op);
6068 const DataLayout &DataLayout = DAG.getDataLayout();
6069 Align Alignment =
6075 VT.getStoreSize(), Alignment);
6076 SDValue LoadVal;
6077 if (!Offset->isDivergent()) {
6078 SDValue Ops[] = {Rsrc, // source register
6079 Offset, CachePolicy};
6080 SDValue BufferLoad =
6082 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6083 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6084 } else {
6085 SDValue Ops[] = {
6086 DAG.getEntryNode(), // Chain
6087 Rsrc, // rsrc
6088 DAG.getConstant(0, DL, MVT::i32), // vindex
6089 {}, // voffset
6090 {}, // soffset
6091 {}, // offset
6092 CachePolicy, // cachepolicy
6093 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6094 };
6095 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6096 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6097 }
6098 Results.push_back(LoadVal);
6099 return;
6100 }
6101 }
6102 break;
6103 }
6105 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6106 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6107 // FIXME: Hacky
6108 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6109 Results.push_back(Res.getOperand(I));
6110 }
6111 } else {
6112 Results.push_back(Res);
6113 Results.push_back(Res.getValue(1));
6114 }
6115 return;
6116 }
6117
6118 break;
6119 }
6120 case ISD::SELECT: {
6121 SDLoc SL(N);
6122 EVT VT = N->getValueType(0);
6123 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6124 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6125 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6126
6127 EVT SelectVT = NewVT;
6128 if (NewVT.bitsLT(MVT::i32)) {
6129 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6130 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6131 SelectVT = MVT::i32;
6132 }
6133
6134 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6135 N->getOperand(0), LHS, RHS);
6136
6137 if (NewVT != SelectVT)
6138 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6139 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6140 return;
6141 }
6142 case ISD::FNEG: {
6143 if (N->getValueType(0) != MVT::v2f16)
6144 break;
6145
6146 SDLoc SL(N);
6147 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6148
6149 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6150 BC,
6151 DAG.getConstant(0x80008000, SL, MVT::i32));
6152 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6153 return;
6154 }
6155 case ISD::FABS: {
6156 if (N->getValueType(0) != MVT::v2f16)
6157 break;
6158
6159 SDLoc SL(N);
6160 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6161
6162 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6163 BC,
6164 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6165 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6166 return;
6167 }
6168 case ISD::FSQRT: {
6169 if (N->getValueType(0) != MVT::f16)
6170 break;
6171 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6172 break;
6173 }
6174 default:
6176 break;
6177 }
6178}
6179
6180/// Helper function for LowerBRCOND
6181static SDNode *findUser(SDValue Value, unsigned Opcode) {
6182
6183 SDNode *Parent = Value.getNode();
6184 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6185 I != E; ++I) {
6186
6187 if (I.getUse().get() != Value)
6188 continue;
6189
6190 if (I->getOpcode() == Opcode)
6191 return *I;
6192 }
6193 return nullptr;
6194}
6195
6196unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6197 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6198 switch (Intr->getConstantOperandVal(1)) {
6199 case Intrinsic::amdgcn_if:
6200 return AMDGPUISD::IF;
6201 case Intrinsic::amdgcn_else:
6202 return AMDGPUISD::ELSE;
6203 case Intrinsic::amdgcn_loop:
6204 return AMDGPUISD::LOOP;
6205 case Intrinsic::amdgcn_end_cf:
6206 llvm_unreachable("should not occur");
6207 default:
6208 return 0;
6209 }
6210 }
6211
6212 // break, if_break, else_break are all only used as inputs to loop, not
6213 // directly as branch conditions.
6214 return 0;
6215}
6216
6218 const Triple &TT = getTargetMachine().getTargetTriple();
6222}
6223
6225 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6226 return false;
6227
6228 // FIXME: Either avoid relying on address space here or change the default
6229 // address space for functions to avoid the explicit check.
6230 return (GV->getValueType()->isFunctionTy() ||
6233}
6234
6236 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6237}
6238
6240 if (!GV->hasExternalLinkage())
6241 return true;
6242
6243 const auto OS = getTargetMachine().getTargetTriple().getOS();
6244 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6245}
6246
6247 /// This transforms the control flow intrinsics to take the branch destination
6248 /// as their last parameter, and also switches the branch target with BR if the need arises.
6249SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6250 SelectionDAG &DAG) const {
6251 SDLoc DL(BRCOND);
6252
6253 SDNode *Intr = BRCOND.getOperand(1).getNode();
6254 SDValue Target = BRCOND.getOperand(2);
6255 SDNode *BR = nullptr;
6256 SDNode *SetCC = nullptr;
6257
6258 if (Intr->getOpcode() == ISD::SETCC) {
6259 // As long as we negate the condition everything is fine
6260 SetCC = Intr;
6261 Intr = SetCC->getOperand(0).getNode();
6262
6263 } else {
6264 // Get the target from BR if we don't negate the condition
6265 BR = findUser(BRCOND, ISD::BR);
6266 assert(BR && "brcond missing unconditional branch user");
6267 Target = BR->getOperand(1);
6268 }
6269
6270 unsigned CFNode = isCFIntrinsic(Intr);
6271 if (CFNode == 0) {
6272 // This is a uniform branch so we don't need to legalize.
6273 return BRCOND;
6274 }
6275
6276 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6277 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6278
6279 assert(!SetCC ||
6280 (SetCC->getConstantOperandVal(1) == 1 &&
6281 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6282 ISD::SETNE));
6283
6284 // operands of the new intrinsic call
6286 if (HaveChain)
6287 Ops.push_back(BRCOND.getOperand(0));
6288
6289 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6290 Ops.push_back(Target);
6291
6292 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6293
6294 // build the new intrinsic call
6295 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6296
6297 if (!HaveChain) {
6298 SDValue Ops[] = {
6299 SDValue(Result, 0),
6300 BRCOND.getOperand(0)
6301 };
6302
6303 Result = DAG.getMergeValues(Ops, DL).getNode();
6304 }
6305
6306 if (BR) {
6307 // Give the branch instruction our target
6308 SDValue Ops[] = {
6309 BR->getOperand(0),
6310 BRCOND.getOperand(2)
6311 };
6312 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6313 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6314 }
6315
6316 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6317
6318 // Copy the intrinsic results to registers
6319 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6321 if (!CopyToReg)
6322 continue;
6323
6324 Chain = DAG.getCopyToReg(
6325 Chain, DL,
6326 CopyToReg->getOperand(1),
6327 SDValue(Result, i - 1),
6328 SDValue());
6329
6330 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6331 }
6332
6333 // Remove the old intrinsic from the chain
6335 SDValue(Intr, Intr->getNumValues() - 1),
6336 Intr->getOperand(0));
6337
6338 return Chain;
6339}
6340
6341SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6342 SelectionDAG &DAG) const {
6343 MVT VT = Op.getSimpleValueType();
6344 SDLoc DL(Op);
6345 // Checking the depth
6346 if (Op.getConstantOperandVal(0) != 0)
6347 return DAG.getConstant(0, DL, VT);
6348
6351 // Check for kernel and shader functions
6352 if (Info->isEntryFunction())
6353 return DAG.getConstant(0, DL, VT);
6354
6355 MachineFrameInfo &MFI = MF.getFrameInfo();
6356 // There is a call to @llvm.returnaddress in this function
6357 MFI.setReturnAddressIsTaken(true);
6358
6360 // Get the return address reg and mark it as an implicit live-in
6361 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
6362 getRegClassFor(VT, Op.getNode()->isDivergent()));
6362
6363 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6364}
6365
6366SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6367 SDValue Op,
6368 const SDLoc &DL,
6369 EVT VT) const {
6370 return Op.getValueType().bitsLE(VT) ?
6371 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6372 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6373 DAG.getTargetConstant(0, DL, MVT::i32));
6374}
6375
6376SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6377 assert(Op.getValueType() == MVT::f16 &&
6378 "Do not know how to custom lower FP_ROUND for non-f16 type");
6379
6380 SDValue Src = Op.getOperand(0);
6381 EVT SrcVT = Src.getValueType();
6382 if (SrcVT != MVT::f64)
6383 return Op;
6384
6385 // TODO: Handle strictfp
6386 if (Op.getOpcode() != ISD::FP_ROUND)
6387 return Op;
6388
6389 SDLoc DL(Op);
6390
6391 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6392 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6393 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6394}
6395
6396SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6397 SelectionDAG &DAG) const {
6398 EVT VT = Op.getValueType();
6399 const MachineFunction &MF = DAG.getMachineFunction();
6401 bool IsIEEEMode = Info->getMode().IEEE;
6402
6403 // FIXME: Assert during selection that this is only selected for
6404 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6405 // mode functions, but this happens to be OK since it's only done in cases
6406 // where it is known that there are no sNaNs.
6407 if (IsIEEEMode)
6408 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6409
6410 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6411 VT == MVT::v16bf16)
6412 return splitBinaryVectorOp(Op, DAG);
6413 return Op;
6414}
6415
6416SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6417 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6418 EVT VT = Op.getValueType();
6419 assert(VT == MVT::f16);
6420
6421 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6422 EVT ExpVT = Exp.getValueType();
6423 if (ExpVT == MVT::i16)
6424 return Op;
6425
6426 SDLoc DL(Op);
6427
6428 // Correct the exponent type for f16 to i16.
6429 // Clamp the range of the exponent to the instruction's range.
6430
6431 // TODO: This should be a generic narrowing legalization, and can easily be
6432 // done for GlobalISel.
6433
6434 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6435 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6436
6437 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6438 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6439
6440 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6441
6442 if (IsStrict) {
6443 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6444 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6445 }
6446
6447 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6448}
6449
6450// Custom lowering for vector multiplications and s_mul_u64.
6451SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6452 EVT VT = Op.getValueType();
6453
6454 // Split vector operands.
6455 if (VT.isVector())
6456 return splitBinaryVectorOp(Op, DAG);
6457
6458 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6459
6460 // There are four ways to lower s_mul_u64:
6461 //
6462 // 1. If all the operands are uniform, then we lower it as it is.
6463 //
6464 // 2. If the operands are divergent, then we have to split s_mul_u64 into
6465 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
6466 //
6467 // 3. If the cost model decides that it is more efficient to use vector
6468 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6469 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6470 //
6471 // 4. If the cost model decides to use vector registers and both of the
6472 // operands are zero-extended/sign-extended from 32-bits, then we split the
6473 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6474 // possible to check if the operands are zero-extended or sign-extended in
6475 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6476 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6477 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6478 // If the cost model decides that we have to use vector registers, then
6479 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6480 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6481 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6482 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6483 // SIInstrInfo.cpp .
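// Illustrative example (annotation): for a uniform
//   mul i64 (zext i32 %a to i64), (zext i32 %b to i64)
// both operands have at least 32 known leading zero bits, so the code below
// emits S_MUL_U64_U32_PSEUDO rather than a plain s_mul_u64.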
6484
6485 if (Op->isDivergent())
6486 return SDValue();
6487
6488 SDValue Op0 = Op.getOperand(0);
6489 SDValue Op1 = Op.getOperand(1);
6490 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
6491 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6492 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6493 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6494 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6495 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6496 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6497 SDLoc SL(Op);
6498 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6499 return SDValue(
6500 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6501 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6502 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6503 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6504 return SDValue(
6505 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6506 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6507 return Op;
6508}
6509
6510SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6511 EVT VT = Op.getValueType();
6512 SDLoc SL(Op);
6513 SDValue LHS = Op.getOperand(0);
6514 SDValue RHS = Op.getOperand(1);
6515 bool isSigned = Op.getOpcode() == ISD::SMULO;
6516
6517 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6518 const APInt &C = RHSC->getAPIntValue();
6519 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6520 if (C.isPowerOf2()) {
6521 // smulo(x, signed_min) is the same as umulo(x, signed_min).
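// E.g. (annotation): umulo(x, 8) becomes { x << 3, (x << 3) >> 3 != x },
// avoiding the multiply entirely.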
6522 bool UseArithShift = isSigned && !C.isMinSignedValue();
6523 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6524 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6525 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6526 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6527 SL, VT, Result, ShiftAmt),
6528 LHS, ISD::SETNE);
6529 return DAG.getMergeValues({ Result, Overflow }, SL);
6530 }
6531 }
6532
6533 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6534 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
6535 SL, VT, LHS, RHS);
6536
6537 SDValue Sign = isSigned
6538 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6539 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6540 : DAG.getConstant(0, SL, VT);
6541 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6542
6543 return DAG.getMergeValues({ Result, Overflow }, SL);
6544}
6545
6546SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6547 if (Op->isDivergent()) {
6548 // Select to V_MAD_[IU]64_[IU]32.
6549 return Op;
6550 }
6551 if (Subtarget->hasSMulHi()) {
6552 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6553 return SDValue();
6554 }
6555 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6556 // calculate the high part, so we might as well do the whole thing with
6557 // V_MAD_[IU]64_[IU]32.
6558 return Op;
6559}
6560
6561SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6562 if (!Subtarget->isTrapHandlerEnabled() ||
6564 return lowerTrapEndpgm(Op, DAG);
6565
6566 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6567 lowerTrapHsaQueuePtr(Op, DAG);
6568}
6569
6570SDValue SITargetLowering::lowerTrapEndpgm(
6571 SDValue Op, SelectionDAG &DAG) const {
6572 SDLoc SL(Op);
6573 SDValue Chain = Op.getOperand(0);
6574 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6575}
6576
6577SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6578 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6581 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6583 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6586}
6587
6588SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6589 SDValue Op, SelectionDAG &DAG) const {
6590 SDLoc SL(Op);
6591 SDValue Chain = Op.getOperand(0);
6592
6593 SDValue QueuePtr;
6594 // For code object version 5, QueuePtr is passed through implicit kernarg.
6595 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6597 QueuePtr =
6598 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6599 } else {
6602 Register UserSGPR = Info->getQueuePtrUserSGPR();
6603
6604 if (UserSGPR == AMDGPU::NoRegister) {
6605 // We probably are in a function incorrectly marked with
6606 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6607 // trap, so just use a null pointer.
6608 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6609 } else {
6610 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6611 MVT::i64);
6612 }
6613 }
6614
6615 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6616 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6617 QueuePtr, SDValue());
6618
6620 SDValue Ops[] = {
6621 ToReg,
6622 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6623 SGPR01,
6624 ToReg.getValue(1)
6625 };
6626 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6627}
6628
6629SDValue SITargetLowering::lowerTrapHsa(
6630 SDValue Op, SelectionDAG &DAG) const {
6631 SDLoc SL(Op);
6632 SDValue Chain = Op.getOperand(0);
6633
6634 // We need to simulate the 's_trap 2' instruction on targets that run in
6635 // PRIV=1 (where it is treated as a nop).
6636 if (Subtarget->hasPrivEnabledTrap2NopBug())
6637 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6638
6640 SDValue Ops[] = {
6641 Chain,
6642 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6643 };
6644 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6645}
6646
6647SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6648 SDLoc SL(Op);
6649 SDValue Chain = Op.getOperand(0);
6651
6652 if (!Subtarget->isTrapHandlerEnabled() ||
6655 "debugtrap handler not supported",
6656 Op.getDebugLoc(),
6657 DS_Warning);
6658 LLVMContext &Ctx = MF.getFunction().getContext();
6659 Ctx.diagnose(NoTrap);
6660 return Chain;
6661 }
6662
6664 SDValue Ops[] = {
6665 Chain,
6666 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6667 };
6668 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6669}
6670
6671SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6672 SelectionDAG &DAG) const {
6673 if (Subtarget->hasApertureRegs()) {
6674 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6675 ? AMDGPU::SRC_SHARED_BASE
6676 : AMDGPU::SRC_PRIVATE_BASE;
6677 // Note: this feature (register) is broken. When used as a 32-bit operand,
6678 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6679 // bits.
6680 //
6681 // To work around the issue, directly emit a 64 bit mov from this register
6682 // then extract the high bits. Note that this shouldn't even result in a
6683 // shift being emitted and simply become a pair of registers (e.g.):
6684 // s_mov_b64 s[6:7], src_shared_base
6685 // v_mov_b32_e32 v1, s7
6686 //
6687 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6688 // coalescing would kick in and it would think it's okay to use the "HI"
6689 // subregister directly (instead of extracting the HI 32 bits) which is an
6690 // artificial (unusable) register.
6691 // Register TableGen definitions would need an overhaul to get rid of the
6692 // artificial "HI" aperture registers and prevent this kind of issue from
6693 // happening.
6694 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6695 DAG.getRegister(ApertureRegNo, MVT::i64));
6696 return DAG.getNode(
6697 ISD::TRUNCATE, DL, MVT::i32,
6698 DAG.getNode(ISD::SRL, DL, MVT::i64,
6699 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6700 }
6701
6702 // For code object version 5, private_base and shared_base are passed through
6703 // implicit kernargs.
6704 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6708 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6709 }
6710
6713 Register UserSGPR = Info->getQueuePtrUserSGPR();
6714 if (UserSGPR == AMDGPU::NoRegister) {
6715 // We probably are in a function incorrectly marked with
6716 // amdgpu-no-queue-ptr. This is undefined.
6717 return DAG.getUNDEF(MVT::i32);
6718 }
6719
6720 SDValue QueuePtr = CreateLiveInRegister(
6721 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6722
6723 // Offset into amd_queue_t for group_segment_aperture_base_hi /
6724 // private_segment_aperture_base_hi.
6725 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
6726
6727 SDValue Ptr =
6728 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
6729
6730 // TODO: Use custom target PseudoSourceValue.
6731 // TODO: We should use the value from the IR intrinsic call, but it might not
6732 // be available, and how would we get it?
6734 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
6735 commonAlignment(Align(64), StructOffset),
6738}
6739
6740/// Return true if the value is a known valid address, such that a null check is
6741/// not necessary.
6743 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
6744 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6745 isa<BasicBlockSDNode>(Val))
6746 return true;
6747
6748 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6749 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
6750
6751 // TODO: Search through arithmetic, handle arguments and loads
6752 // marked nonnull.
6753 return false;
6754}
6755
6756SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
6757 SelectionDAG &DAG) const {
6758 SDLoc SL(Op);
6759
6760 const AMDGPUTargetMachine &TM =
6761 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
6762
6763 unsigned DestAS, SrcAS;
6764 SDValue Src;
6765 bool IsNonNull = false;
6766 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
6767 SrcAS = ASC->getSrcAddressSpace();
6768 Src = ASC->getOperand(0);
6769 DestAS = ASC->getDestAddressSpace();
6770 } else {
6771 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6772 Op.getConstantOperandVal(0) ==
6773 Intrinsic::amdgcn_addrspacecast_nonnull);
6774 Src = Op->getOperand(1);
6775 SrcAS = Op->getConstantOperandVal(2);
6776 DestAS = Op->getConstantOperandVal(3);
6777 IsNonNull = true;
6778 }
6779
6780 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6781
6782 // flat -> local/private
6783 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6784 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
6785 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
6786 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6787
6788 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6789 return Ptr;
6790
6791 unsigned NullVal = TM.getNullPointerValue(DestAS);
6792 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6793 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
6794
6795 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
6796 SegmentNullPtr);
6797 }
6798 }
6799
6800 // local/private -> flat
6801 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
6802 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
6803 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
6804
6805 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
6806 SDValue CvtPtr =
6807 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
6808 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
6809
6810 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6811 return CvtPtr;
6812
6813 unsigned NullVal = TM.getNullPointerValue(SrcAS);
6814 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6815
6816 SDValue NonNull
6817 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
6818
6819 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
6820 FlatNullPtr);
6821 }
6822 }
6823
6824 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6825 Op.getValueType() == MVT::i64) {
6828 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
6829 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
6830 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6831 }
6832
6833 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6834 Src.getValueType() == MVT::i64)
6835 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6836
6837 // global <-> flat are no-ops and never emitted.
6838
6839 const MachineFunction &MF = DAG.getMachineFunction();
6840 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
6841 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
6842 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
6843
6844 return DAG.getUNDEF(Op->getValueType(0));
6845}
6846
6847// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
6848// the small vector and inserting them into the big vector. That is better than
6849// the default expansion of doing it via a stack slot. Even though the use of
6850// the stack slot would be optimized away afterwards, the stack slot itself
6851// remains.
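// For example (annotation): inserting a v2i16 subvector into a v8i16 vector
// at an even index takes the 32-bit path below and becomes a single
// INSERT_VECTOR_ELT of one i32 lane, rather than a stack store and reload.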
6852SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
6853 SelectionDAG &DAG) const {
6854 SDValue Vec = Op.getOperand(0);
6855 SDValue Ins = Op.getOperand(1);
6856 SDValue Idx = Op.getOperand(2);
6857 EVT VecVT = Vec.getValueType();
6858 EVT InsVT = Ins.getValueType();
6859 EVT EltVT = VecVT.getVectorElementType();
6860 unsigned InsNumElts = InsVT.getVectorNumElements();
6861 unsigned IdxVal = Idx->getAsZExtVal();
6862 SDLoc SL(Op);
6863
6864 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6865 // Insert 32-bit registers at a time.
6866 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6867
6868 unsigned VecNumElts = VecVT.getVectorNumElements();
6869 EVT NewVecVT =
6870 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6871 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6873 MVT::i32, InsNumElts / 2);
6874
6875 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
6876 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
6877
6878 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6879 SDValue Elt;
6880 if (InsNumElts == 2) {
6881 Elt = Ins;
6882 } else {
6883 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6884 DAG.getConstant(I, SL, MVT::i32));
6885 }
6886 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6887 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6888 }
6889
6890 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
6891 }
6892
6893 for (unsigned I = 0; I != InsNumElts; ++I) {
6894 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6895 DAG.getConstant(I, SL, MVT::i32));
6896 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6897 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6898 }
6899 return Vec;
6900}
6901
6902SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6903 SelectionDAG &DAG) const {
6904 SDValue Vec = Op.getOperand(0);
6905 SDValue InsVal = Op.getOperand(1);
6906 SDValue Idx = Op.getOperand(2);
6907 EVT VecVT = Vec.getValueType();
6908 EVT EltVT = VecVT.getVectorElementType();
6909 unsigned VecSize = VecVT.getSizeInBits();
6910 unsigned EltSize = EltVT.getSizeInBits();
6911 SDLoc SL(Op);
6912
6913 // Specially handle the case of v4i16 with static indexing.
6914 unsigned NumElts = VecVT.getVectorNumElements();
6915 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
6916 if (NumElts == 4 && EltSize == 16 && KIdx) {
6917 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
6918
6919 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6920 DAG.getConstant(0, SL, MVT::i32));
6921 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6922 DAG.getConstant(1, SL, MVT::i32));
6923
6924 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
6925 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
6926
6927 unsigned Idx = KIdx->getZExtValue();
6928 bool InsertLo = Idx < 2;
6929 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
6930 InsertLo ? LoVec : HiVec,
6931 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
6932 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
6933
6934 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
6935
6936 SDValue Concat = InsertLo ?
6937 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
6938 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
6939
6940 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
6941 }
6942
6943 // Static indexing does not lower to stack access, and hence there is no need
6944 // for special custom lowering to avoid stack access.
6945 if (isa<ConstantSDNode>(Idx))
6946 return SDValue();
6947
6948 // Avoid stack access for dynamic indexing by custom lowering to
6949 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
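// Worked example (annotation): for a v4f16 destination and dynamic index i,
// the mask built below is 0xffff << (i * 16); the splatted new value is ANDed
// with that mask, the original vector with its complement, and the two are
// ORed together (steps 1-4 below).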
6950
6951 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
6952
6953 MVT IntVT = MVT::getIntegerVT(VecSize);
6954
6955 // Convert vector index to bit-index and get the required bit mask.
6956 assert(isPowerOf2_32(EltSize));
6957 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
6958 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6959 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6960 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
6961 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
6962
6963 // 1. Create a congruent vector with the target value in each element.
6964 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
6965 DAG.getSplatBuildVector(VecVT, SL, InsVal));
6966
6967 // 2. Mask off all other indices except the required index within (1).
6968 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
6969
6970 // 3. Mask off the required index within the target vector.
6971 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
6972 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
6973 DAG.getNOT(SL, BFM, IntVT), BCVec);
6974
6975 // 4. Get (2) and (3) ORed into the target vector.
6976 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
6977
6978 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
6979}
6980
6981SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
6982 SelectionDAG &DAG) const {
6983 SDLoc SL(Op);
6984
6985 EVT ResultVT = Op.getValueType();
6986 SDValue Vec = Op.getOperand(0);
6987 SDValue Idx = Op.getOperand(1);
6988 EVT VecVT = Vec.getValueType();
6989 unsigned VecSize = VecVT.getSizeInBits();
6990 EVT EltVT = VecVT.getVectorElementType();
6991
6992 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
6993
6994 // Make sure we do any optimizations that will make it easier to fold
6995 // source modifiers before obscuring it with bit operations.
6996
6997 // XXX - Why doesn't this get called when vector_shuffle is expanded?
6998 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
6999 return Combined;
7000
7001 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7002 SDValue Lo, Hi;
7003 EVT LoVT, HiVT;
7004 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7005
7006 if (VecSize == 128) {
7007 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7008 Lo = DAG.getBitcast(LoVT,
7009 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7010 DAG.getConstant(0, SL, MVT::i32)));
7011 Hi = DAG.getBitcast(HiVT,
7012 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7013 DAG.getConstant(1, SL, MVT::i32)));
7014 } else if (VecSize == 256) {
7015 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7016 SDValue Parts[4];
7017 for (unsigned P = 0; P < 4; ++P) {
7018 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7019 DAG.getConstant(P, SL, MVT::i32));
7020 }
7021
7022 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7023 Parts[0], Parts[1]));
7024 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7025 Parts[2], Parts[3]));
7026 } else {
7027 assert(VecSize == 512);
7028
7029 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7030 SDValue Parts[8];
7031 for (unsigned P = 0; P < 8; ++P) {
7032 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7033 DAG.getConstant(P, SL, MVT::i32));
7034 }
7035
7036 Lo = DAG.getBitcast(LoVT,
7037 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7038 Parts[0], Parts[1], Parts[2], Parts[3]));
7039 Hi = DAG.getBitcast(HiVT,
7040 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7041 Parts[4], Parts[5],Parts[6], Parts[7]));
7042 }
7043
7044 EVT IdxVT = Idx.getValueType();
7045 unsigned NElem = VecVT.getVectorNumElements();
7046 assert(isPowerOf2_32(NElem));
7047 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7048 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7049 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7050 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7051 }
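// Illustrative example (values assumed): for a dynamic extract from a v8i32
// source (NElem = 8), the split above behaves like
//
//   IdxMask = NElem / 2 - 1 = 3
//   NewIdx  = Idx & 3                // index within the selected half
//   Half    = (Idx > 3) ? Hi : Lo    // SETUGT against IdxMask
//   Result  = extractelement Half, NewIdx
//
// e.g. Idx = 6 selects Hi and reads its element 2.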
7052
7053 assert(VecSize <= 64);
7054
7055 MVT IntVT = MVT::getIntegerVT(VecSize);
7056
7057 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7058 SDValue VecBC = peekThroughBitcasts(Vec);
7059 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7060 SDValue Src = VecBC.getOperand(0);
7061 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7062 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7063 }
7064
7065 unsigned EltSize = EltVT.getSizeInBits();
7066 assert(isPowerOf2_32(EltSize));
7067
7068 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7069
7070 // Convert vector index to bit-index (* EltSize)
7071 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7072
7073 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7074 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7075
7076 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7077 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7078 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7079 }
7080
7081 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7082}
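// Illustrative sketch for the <= 64-bit path above: extracting element `idx`
// from a v4i16 held in a 64-bit integer `bits` reduces to a shift and a
// truncate (names local to this sketch):
//
//   unsigned ScaledIdx = idx << 4;           // idx * 16, Log2_32(EltSize) = 4
//   uint64_t Elt       = bits >> ScaledIdx;  // SRL by the bit index
//   uint16_t Result    = (uint16_t)Elt;      // TRUNCATE (bitcast for f16/bf16)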
7083
7084static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7085 assert(Elt % 2 == 0);
7086 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7087}
7088
7089SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7090 SelectionDAG &DAG) const {
7091 SDLoc SL(Op);
7092 EVT ResultVT = Op.getValueType();
7093 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7094
7095 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7096 EVT EltVT = PackVT.getVectorElementType();
7097 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7098
7099 // vector_shuffle <0,1,6,7> lhs, rhs
7100 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7101 //
7102 // vector_shuffle <6,7,2,3> lhs, rhs
7103 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7104 //
7105 // vector_shuffle <6,7,0,1> lhs, rhs
7106 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7107
7108 // Avoid scalarizing when both halves are reading from consecutive elements.
7109 SmallVector<SDValue, 16> Pieces;
7110 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7111 if (elementPairIsContiguous(SVN->getMask(), I)) {
7112 const int Idx = SVN->getMaskElt(I);
7113 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7114 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7115 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7116 PackVT, SVN->getOperand(VecIdx),
7117 DAG.getConstant(EltIdx, SL, MVT::i32));
7118 Pieces.push_back(SubVec);
7119 } else {
7120 const int Idx0 = SVN->getMaskElt(I);
7121 const int Idx1 = SVN->getMaskElt(I + 1);
7122 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7123 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7124 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7125 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7126
7127 SDValue Vec0 = SVN->getOperand(VecIdx0);
7128 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7129 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7130
7131 SDValue Vec1 = SVN->getOperand(VecIdx1);
7132 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7133 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7134 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7135 }
7136 }
7137
7138 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7139}
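// Illustrative walk-through (assumed example, mirroring the comment above):
// for v4i16 inputs lhs/rhs and mask <0,1,6,7>, both element pairs are
// contiguous, so the loop builds
//
//   piece0 = extract_subvector lhs, 0   // elements 0,1 of lhs
//   piece1 = extract_subvector rhs, 2   // mask 6,7 -> rhs elements 2,3
//   result = concat_vectors piece0, piece1
//
// whereas a non-contiguous pair such as <0,3,...> falls into the else branch
// and assembles its v2i16 piece from two scalar extracts.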
7140
7141SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7142 SelectionDAG &DAG) const {
7143 SDValue SVal = Op.getOperand(0);
7144 EVT ResultVT = Op.getValueType();
7145 EVT SValVT = SVal.getValueType();
7146 SDValue UndefVal = DAG.getUNDEF(SValVT);
7147 SDLoc SL(Op);
7148
7149 SmallVector<SDValue, 16> VElts;
7150 VElts.push_back(SVal);
7151 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7152 VElts.push_back(UndefVal);
7153
7154 return DAG.getBuildVector(ResultVT, SL, VElts);
7155}
7156
7157SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7158 SelectionDAG &DAG) const {
7159 SDLoc SL(Op);
7160 EVT VT = Op.getValueType();
7161
7162 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7163 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7164 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7165 VT.getVectorNumElements() / 2);
7166 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7167
7168 // Turn into pair of packed build_vectors.
7169 // TODO: Special case for constants that can be materialized with s_mov_b64.
7170 SmallVector<SDValue, 4> LoOps, HiOps;
7171 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7172 LoOps.push_back(Op.getOperand(I));
7173 HiOps.push_back(Op.getOperand(I + E));
7174 }
7175 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7176 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7177
7178 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7179 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7180
7181 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7182 { CastLo, CastHi });
7183 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7184 }
7185
7186 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7187 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7188 VT.getVectorNumElements() / 4);
7189 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7190
7191 SmallVector<SDValue, 4> Parts[4];
7192 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7193 for (unsigned P = 0; P < 4; ++P)
7194 Parts[P].push_back(Op.getOperand(I + P * E));
7195 }
7196 SDValue Casts[4];
7197 for (unsigned P = 0; P < 4; ++P) {
7198 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7199 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7200 }
7201
7202 SDValue Blend =
7203 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7204 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7205 }
7206
7207 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7208 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
7209 VT.getVectorNumElements() / 8);
7210 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7211
7212 SmallVector<SDValue, 8> Parts[8];
7213 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7214 for (unsigned P = 0; P < 8; ++P)
7215 Parts[P].push_back(Op.getOperand(I + P * E));
7216 }
7217 SDValue Casts[8];
7218 for (unsigned P = 0; P < 8; ++P) {
7219 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7220 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7221 }
7222
7223 SDValue Blend =
7224 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7225 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7226 }
7227
7228 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7229 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7230
7231 SDValue Lo = Op.getOperand(0);
7232 SDValue Hi = Op.getOperand(1);
7233
7234 // Avoid adding defined bits with the zero_extend.
7235 if (Hi.isUndef()) {
7236 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7237 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7238 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7239 }
7240
7241 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7242 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7243
7244 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7245 DAG.getConstant(16, SL, MVT::i32));
7246 if (Lo.isUndef())
7247 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7248
7249 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7250 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7251
7252 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7253 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7254}
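// Minimal sketch of the trailing v2i16/v2f16/v2bf16 case (names assumed):
// with defined 16-bit lanes lo and hi, the packed 32-bit value is effectively
//
//   uint32_t Packed = (uint32_t)lo | ((uint32_t)hi << 16);
//
// which is then bitcast back to the two-element vector type; when one lane is
// undef the corresponding half of this expression is dropped, as above.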
7255
7256bool
7257 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7258 // OSes that use ELF REL relocations (instead of RELA) can only store a
7259 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7260 // which can create arbitrary 64-bit addends. (This is only a problem for
7261 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7262 // the high 32 bits of the addend.)
7263 //
7264 // This should be kept in sync with how HasRelocationAddend is initialized in
7265 // the constructor of ELFAMDGPUAsmBackend.
7266 if (!Subtarget->isAmdHsaOS())
7267 return false;
7268
7269 // We can fold offsets for anything that doesn't require a GOT relocation.
7270 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7271 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7272 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7273 !shouldEmitGOTReloc(GA->getGlobal());
7274}
7275
7276static SDValue
7278 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7279 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7280 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7281 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7282 // lowered to the following code sequence:
7283 //
7284 // For constant address space:
7285 // s_getpc_b64 s[0:1]
7286 // s_add_u32 s0, s0, $symbol
7287 // s_addc_u32 s1, s1, 0
7288 //
7289 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7290 // a fixup or relocation is emitted to replace $symbol with a literal
7291 // constant, which is a pc-relative offset from the encoding of the $symbol
7292 // operand to the global variable.
7293 //
7294 // For global address space:
7295 // s_getpc_b64 s[0:1]
7296 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7297 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7298 //
7299 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7300 // fixups or relocations are emitted to replace $symbol@*@lo and
7301 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7302 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7303 // operand to the global variable.
7304 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7305 SDValue PtrHi;
7306 if (GAFlags == SIInstrInfo::MO_NONE)
7307 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7308 else
7309 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7310 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7311}
7312
7313SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7314 SDValue Op,
7315 SelectionDAG &DAG) const {
7316 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7317 SDLoc DL(GSD);
7318 EVT PtrVT = Op.getValueType();
7319
7320 const GlobalValue *GV = GSD->getGlobal();
7326 GV->hasExternalLinkage()) {
7327 Type *Ty = GV->getValueType();
7328 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7329 // zero-sized type in other languages to declare dynamic shared memory
7330 // whose size is not known at compile time. It is allocated by the
7331 // runtime and placed directly after the statically allocated memory.
7332 // All such dynamic arrays share the same offset.
7333 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7334 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7335 // Adjust alignment for that dynamic shared memory array.
7337 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7338 MFI->setUsesDynamicLDS(true);
7339 return SDValue(
7340 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7341 }
7342 }
7343 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7344 }
7345
7347 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
7349 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7350 }
7351
7352 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7353 SDValue AddrLo = DAG.getTargetGlobalAddress(
7354 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7355 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7356
7357 SDValue AddrHi = DAG.getTargetGlobalAddress(
7358 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7359 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7360
7361 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7362 }
7363
7364 if (shouldEmitFixup(GV))
7365 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7366
7367 if (shouldEmitPCReloc(GV))
7368 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
7370
7371 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
7373
7374 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
7376 const DataLayout &DataLayout = DAG.getDataLayout();
7377 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
7378 MachinePointerInfo PtrInfo
7380
7381 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
7384}
7385
7387 const SDLoc &DL, SDValue V) const {
7388 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7389 // the destination register.
7390 //
7391 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7392 // so we will end up with redundant moves to m0.
7393 //
7394 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7395
7396 // A Null SDValue creates a glue result.
7397 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7398 V, Chain);
7399 return SDValue(M0, 0);
7400}
7401
7402SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7403 SDValue Op,
7404 MVT VT,
7405 unsigned Offset) const {
7406 SDLoc SL(Op);
7407 SDValue Param = lowerKernargMemParameter(
7408 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7409 // The local size values will have the hi 16-bits as zero.
7410 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7411 DAG.getValueType(VT));
7412}
7413
7415 EVT VT) {
7417 "non-hsa intrinsic with hsa target",
7418 DL.getDebugLoc());
7419 DAG.getContext()->diagnose(BadIntrin);
7420 return DAG.getUNDEF(VT);
7421}
7422
7424 EVT VT) {
7426 "intrinsic not supported on subtarget",
7427 DL.getDebugLoc());
7428 DAG.getContext()->diagnose(BadIntrin);
7429 return DAG.getUNDEF(VT);
7430}
7431
7433 ArrayRef<SDValue> Elts) {
7434 assert(!Elts.empty());
7435 MVT Type;
7436 unsigned NumElts = Elts.size();
7437
7438 if (NumElts <= 12) {
7439 Type = MVT::getVectorVT(MVT::f32, NumElts);
7440 } else {
7441 assert(Elts.size() <= 16);
7442 Type = MVT::v16f32;
7443 NumElts = 16;
7444 }
7445
7446 SmallVector<SDValue, 16> VecElts(NumElts);
7447 for (unsigned i = 0; i < Elts.size(); ++i) {
7448 SDValue Elt = Elts[i];
7449 if (Elt.getValueType() != MVT::f32)
7450 Elt = DAG.getBitcast(MVT::f32, Elt);
7451 VecElts[i] = Elt;
7452 }
7453 for (unsigned i = Elts.size(); i < NumElts; ++i)
7454 VecElts[i] = DAG.getUNDEF(MVT::f32);
7455
7456 if (NumElts == 1)
7457 return VecElts[0];
7458 return DAG.getBuildVector(Type, DL, VecElts);
7459}
7460
7461static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7462 SDValue Src, int ExtraElts) {
7463 EVT SrcVT = Src.getValueType();
7464
7466
7467 if (SrcVT.isVector())
7468 DAG.ExtractVectorElements(Src, Elts);
7469 else
7470 Elts.push_back(Src);
7471
7472 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7473 while (ExtraElts--)
7474 Elts.push_back(Undef);
7475
7476 return DAG.getBuildVector(CastVT, DL, Elts);
7477}
7478
7479// Reconstruct the required return value for an image load intrinsic.
7480// This is more complicated due to the optional use of TexFailCtrl, which means
7481// the required return type is an aggregate.
7483 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7484 bool Unpacked, bool IsD16, int DMaskPop,
7485 int NumVDataDwords, bool IsAtomicPacked16Bit,
7486 const SDLoc &DL) {
7487 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7488 EVT ReqRetVT = ResultTypes[0];
7489 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7490 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7491 ? (ReqRetNumElts + 1) / 2
7492 : ReqRetNumElts;
7493
7494 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7495 DMaskPop : (DMaskPop + 1) / 2;
7496
7497 MVT DataDwordVT = NumDataDwords == 1 ?
7498 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7499
7500 MVT MaskPopVT = MaskPopDwords == 1 ?
7501 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7502
7503 SDValue Data(Result, 0);
7504 SDValue TexFail;
7505
7506 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7507 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7508 if (MaskPopVT.isVector()) {
7509 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7510 SDValue(Result, 0), ZeroIdx);
7511 } else {
7512 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7513 SDValue(Result, 0), ZeroIdx);
7514 }
7515 }
7516
7517 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7518 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7519 NumDataDwords - MaskPopDwords);
7520
7521 if (IsD16)
7522 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7523
7524 EVT LegalReqRetVT = ReqRetVT;
7525 if (!ReqRetVT.isVector()) {
7526 if (!Data.getValueType().isInteger())
7527 Data = DAG.getNode(ISD::BITCAST, DL,
7528 Data.getValueType().changeTypeToInteger(), Data);
7529 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7530 } else {
7531 // We need to widen the return vector to a legal type
7532 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7533 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7534 LegalReqRetVT =
7536 ReqRetVT.getVectorNumElements() + 1);
7537 }
7538 }
7539 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7540
7541 if (IsTexFail) {
7542 TexFail =
7543 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7544 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7545
7546 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7547 }
7548
7549 if (Result->getNumValues() == 1)
7550 return Data;
7551
7552 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7553}
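// Worked example (values assumed, not from the source): a d16 image load
// returning v3f16 on a target with packed d16 memory has ReqRetNumElts = 3
// and DMaskPop = 3, so
//
//   NumDataDwords = (3 + 1) / 2 = 2
//   MaskPopDwords = (3 + 1) / 2 = 2
//
// and when TFE is enabled the caller appends one extra dword, whose status
// value is re-extracted here at index MaskPopDwords.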
7554
7555static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7556 SDValue *LWE, bool &IsTexFail) {
7557 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7558
7559 uint64_t Value = TexFailCtrlConst->getZExtValue();
7560 if (Value) {
7561 IsTexFail = true;
7562 }
7563
7564 SDLoc DL(TexFailCtrlConst);
7565 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7566 Value &= ~(uint64_t)0x1;
7567 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7568 Value &= ~(uint64_t)0x2;
7569
7570 return Value == 0;
7571}
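// Example of the encoding parsed above (value assumed for illustration): a
// texfailctrl operand of 3 sets both flags, i.e.
//
//   *TFE = (3 & 0x1) ? 1 : 0   // = 1
//   *LWE = (3 & 0x2) ? 1 : 0   // = 1
//
// and any bit outside 0x1/0x2 makes the helper return false, so the caller
// leaves the intrinsic untouched.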
7572
7574 MVT PackVectorVT,
7575 SmallVectorImpl<SDValue> &PackedAddrs,
7576 unsigned DimIdx, unsigned EndIdx,
7577 unsigned NumGradients) {
7578 SDLoc DL(Op);
7579 for (unsigned I = DimIdx; I < EndIdx; I++) {
7580 SDValue Addr = Op.getOperand(I);
7581
7582 // Gradients are packed with undef for each coordinate.
7583 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7584 // 1D: undef,dx/dh; undef,dx/dv
7585 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7586 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7587 if (((I + 1) >= EndIdx) ||
7588 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7589 I == DimIdx + NumGradients - 1))) {
7590 if (Addr.getValueType() != MVT::i16)
7591 Addr = DAG.getBitcast(MVT::i16, Addr);
7592 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7593 } else {
7594 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7595 I++;
7596 }
7597 Addr = DAG.getBitcast(MVT::f32, Addr);
7598 PackedAddrs.push_back(Addr);
7599 }
7600}
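// Illustrative example (assumed operand order): for a 2D sample the four
// 16-bit gradient operands dx/dh, dy/dh, dx/dv, dy/dv are packed into two
// dwords
//
//   dword0 = build_vector dx/dh, dy/dh   // <hi,lo> = dy/dh,dx/dh
//   dword1 = build_vector dx/dv, dy/dv   // <hi,lo> = dy/dv,dx/dv
//
// while the odd trailing coordinate of the 1D and 3D cases is any-extended on
// its own, matching the layout in the comment above.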
7601
7602SDValue SITargetLowering::lowerImage(SDValue Op,
7604 SelectionDAG &DAG, bool WithChain) const {
7605 SDLoc DL(Op);
7607 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7608 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7610 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7611 unsigned IntrOpcode = Intr->BaseOpcode;
7612 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7613 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7614 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7615
7616 SmallVector<EVT, 3> ResultTypes(Op->values());
7617 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7618 bool IsD16 = false;
7619 bool IsG16 = false;
7620 bool IsA16 = false;
7621 SDValue VData;
7622 int NumVDataDwords;
7623 bool AdjustRetType = false;
7624 bool IsAtomicPacked16Bit = false;
7625
7626 // Offset of intrinsic arguments
7627 const unsigned ArgOffset = WithChain ? 2 : 1;
7628
7629 unsigned DMask;
7630 unsigned DMaskLanes = 0;
7631
7632 if (BaseOpcode->Atomic) {
7633 VData = Op.getOperand(2);
7634
7635 IsAtomicPacked16Bit =
7636 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7637 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7638
7639 bool Is64Bit = VData.getValueSizeInBits() == 64;
7640 if (BaseOpcode->AtomicX2) {
7641 SDValue VData2 = Op.getOperand(3);
7642 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7643 {VData, VData2});
7644 if (Is64Bit)
7645 VData = DAG.getBitcast(MVT::v4i32, VData);
7646
7647 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7648 DMask = Is64Bit ? 0xf : 0x3;
7649 NumVDataDwords = Is64Bit ? 4 : 2;
7650 } else {
7651 DMask = Is64Bit ? 0x3 : 0x1;
7652 NumVDataDwords = Is64Bit ? 2 : 1;
7653 }
7654 } else {
7655 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7656 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7657
7658 if (BaseOpcode->Store) {
7659 VData = Op.getOperand(2);
7660
7661 MVT StoreVT = VData.getSimpleValueType();
7662 if (StoreVT.getScalarType() == MVT::f16) {
7663 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7664 return Op; // D16 is unsupported for this instruction
7665
7666 IsD16 = true;
7667 VData = handleD16VData(VData, DAG, true);
7668 }
7669
7670 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7671 } else {
7672 // Work out the num dwords based on the dmask popcount and underlying type
7673 // and whether packing is supported.
7674 MVT LoadVT = ResultTypes[0].getSimpleVT();
7675 if (LoadVT.getScalarType() == MVT::f16) {
7676 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7677 return Op; // D16 is unsupported for this instruction
7678
7679 IsD16 = true;
7680 }
7681
7682 // Confirm that the return type is large enough for the dmask specified
7683 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7684 (!LoadVT.isVector() && DMaskLanes > 1))
7685 return Op;
7686
7687 // The sq block of gfx8 and gfx9 do not estimate register use correctly
7688 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7689 // instructions.
7690 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7691 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7692 NumVDataDwords = (DMaskLanes + 1) / 2;
7693 else
7694 NumVDataDwords = DMaskLanes;
7695
7696 AdjustRetType = true;
7697 }
7698 }
7699
7700 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7702
7703 // Check for 16 bit addresses or derivatives and pack if true.
7704 MVT VAddrVT =
7705 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
7706 MVT VAddrScalarVT = VAddrVT.getScalarType();
7707 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7708 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7709
7710 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
7711 VAddrScalarVT = VAddrVT.getScalarType();
7712 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7713 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7714
7715 // Push back extra arguments.
7716 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7717 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7718 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7719 // Special handling of bias when A16 is on. Bias is of type half but
7720 // occupies a full 32-bit dword.
7721 SDValue Bias = DAG.getBuildVector(
7722 MVT::v2f16, DL,
7723 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
7724 VAddrs.push_back(Bias);
7725 } else {
7726 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7727 "Bias needs to be converted to 16 bit in A16 mode");
7728 VAddrs.push_back(Op.getOperand(ArgOffset + I));
7729 }
7730 }
7731
7732 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
7733 // 16 bit gradients are supported, but are tied to the A16 control
7734 // so both gradients and addresses must be 16 bit
7735 LLVM_DEBUG(
7736 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
7737 "require 16 bit args for both gradients and addresses");
7738 return Op;
7739 }
7740
7741 if (IsA16) {
7742 if (!ST->hasA16()) {
7743 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
7744 "support 16 bit addresses\n");
7745 return Op;
7746 }
7747 }
7748
7749 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
7750 // set then we have to compress/pack operands (either addresses, gradients,
7751 // or both).
7752 // In the case where A16 and gradients are tied (no G16 support), we have
7753 // already verified that both IsA16 and IsG16 are true.
7754 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7755 // Activate g16
7756 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
7758 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
7759 }
7760
7761 // Add gradients (packed or unpacked)
7762 if (IsG16) {
7763 // Pack the gradients
7764 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
7765 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
7766 ArgOffset + Intr->GradientStart,
7767 ArgOffset + Intr->CoordStart, Intr->NumGradients);
7768 } else {
7769 for (unsigned I = ArgOffset + Intr->GradientStart;
7770 I < ArgOffset + Intr->CoordStart; I++)
7771 VAddrs.push_back(Op.getOperand(I));
7772 }
7773
7774 // Add addresses (packed or unpacked)
7775 if (IsA16) {
7776 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
7777 ArgOffset + Intr->CoordStart, VAddrEnd,
7778 0 /* No gradients */);
7779 } else {
7780 // Add uncompressed address
7781 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7782 VAddrs.push_back(Op.getOperand(I));
7783 }
7784
7785 // If the register allocator cannot place the address registers contiguously
7786 // without introducing moves, then using the non-sequential address encoding
7787 // is always preferable, since it saves VALU instructions and is usually a
7788 // wash in terms of code size or even better.
7789 //
7790 // However, we currently have no way of hinting to the register allocator that
7791 // MIMG addresses should be placed contiguously when it is possible to do so,
7792 // so force non-NSA for the common 2-address case as a heuristic.
7793 //
7794 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7795 // allocation when possible.
7796 //
7797 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7798 // set of the remaining addresses.
7799 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
7800 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7801 const bool UseNSA = ST->hasNSAEncoding() &&
7802 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7803 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7804 const bool UsePartialNSA =
7805 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
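// Worked example of the NSA decision above (numbers assumed for
// illustration): on a target with NSAMaxSize = 5, partial NSA support, and an
// NSA threshold of 3, a sampler call with 7 address dwords gives
//
//   UseNSA        = true   // 7 >= 3, and partial NSA covers 7 > 5
//   UsePartialNSA = true   // 7 > 5
//
// so the first NSAMaxSize - 1 = 4 addresses stay separate and the remaining 3
// are packed into a single contiguous tuple by getBuildDwordsVector below.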
7806
7807 SDValue VAddr;
7808 if (UsePartialNSA) {
7809 VAddr = getBuildDwordsVector(DAG, DL,
7810 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7811 }
7812 else if (!UseNSA) {
7813 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
7814 }
7815
7816 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
7817 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
7818 SDValue Unorm;
7819 if (!BaseOpcode->Sampler) {
7820 Unorm = True;
7821 } else {
7822 uint64_t UnormConst =
7823 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
7824
7825 Unorm = UnormConst ? True : False;
7826 }
7827
7828 SDValue TFE;
7829 SDValue LWE;
7830 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
7831 bool IsTexFail = false;
7832 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7833 return Op;
7834
7835 if (IsTexFail) {
7836 if (!DMaskLanes) {
7837 // Expecting to get an error flag since TFC is on and dmask is 0.
7838 // Force dmask to be at least 1, otherwise the instruction will fail.
7839 DMask = 0x1;
7840 DMaskLanes = 1;
7841 NumVDataDwords = 1;
7842 }
7843 NumVDataDwords += 1;
7844 AdjustRetType = true;
7845 }
7846
7847 // Something earlier may have tagged the return type as needing adjustment.
7848 // This happens if the instruction is a load or has TexFailCtrl flags set.
7849 if (AdjustRetType) {
7850 // NumVDataDwords reflects the true number of dwords required in the return type
7851 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7852 // This is a no-op load. This can be eliminated
7853 SDValue Undef = DAG.getUNDEF(Op.getValueType());
7854 if (isa<MemSDNode>(Op))
7855 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
7856 return Undef;
7857 }
7858
7859 EVT NewVT = NumVDataDwords > 1 ?
7860 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
7861 : MVT::i32;
7862
7863 ResultTypes[0] = NewVT;
7864 if (ResultTypes.size() == 3) {
7865 // Original result was aggregate type used for TexFailCtrl results
7866 // The actual instruction returns as a vector type which has now been
7867 // created. Remove the aggregate result.
7868 ResultTypes.erase(&ResultTypes[1]);
7869 }
7870 }
7871
7872 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
7873 if (BaseOpcode->Atomic)
7874 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
7875 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
7877 return Op;
7878
7880 if (BaseOpcode->Store || BaseOpcode->Atomic)
7881 Ops.push_back(VData); // vdata
7882 if (UsePartialNSA) {
7883 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
7884 Ops.push_back(VAddr);
7885 }
7886 else if (UseNSA)
7887 append_range(Ops, VAddrs);
7888 else
7889 Ops.push_back(VAddr);
7890 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
7891 if (BaseOpcode->Sampler)
7892 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
7893 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
7894 if (IsGFX10Plus)
7895 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
7896 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7897 Ops.push_back(Unorm);
7898 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
7899 Ops.push_back(IsA16 && // r128, a16 for gfx9
7900 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7901 if (IsGFX10Plus)
7902 Ops.push_back(IsA16 ? True : False);
7903 if (!Subtarget->hasGFX90AInsts()) {
7904 Ops.push_back(TFE); //tfe
7905 } else if (TFE->getAsZExtVal()) {
7906 report_fatal_error("TFE is not supported on this GPU");
7907 }
7908 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7909 Ops.push_back(LWE); // lwe
7910 if (!IsGFX10Plus)
7911 Ops.push_back(DimInfo->DA ? True : False);
7912 if (BaseOpcode->HasD16)
7913 Ops.push_back(IsD16 ? True : False);
7914 if (isa<MemSDNode>(Op))
7915 Ops.push_back(Op.getOperand(0)); // chain
7916
7917 int NumVAddrDwords =
7918 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
7919 int Opcode = -1;
7920
7921 if (IsGFX12Plus) {
7922 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
7923 NumVDataDwords, NumVAddrDwords);
7924 } else if (IsGFX11Plus) {
7925 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7926 UseNSA ? AMDGPU::MIMGEncGfx11NSA
7927 : AMDGPU::MIMGEncGfx11Default,
7928 NumVDataDwords, NumVAddrDwords);
7929 } else if (IsGFX10Plus) {
7930 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7931 UseNSA ? AMDGPU::MIMGEncGfx10NSA
7932 : AMDGPU::MIMGEncGfx10Default,
7933 NumVDataDwords, NumVAddrDwords);
7934 } else {
7935 if (Subtarget->hasGFX90AInsts()) {
7936 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
7937 NumVDataDwords, NumVAddrDwords);
7938 if (Opcode == -1)
7940 "requested image instruction is not supported on this GPU");
7941 }
7942 if (Opcode == -1 &&
7944 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
7945 NumVDataDwords, NumVAddrDwords);
7946 if (Opcode == -1)
7947 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
7948 NumVDataDwords, NumVAddrDwords);
7949 }
7950 if (Opcode == -1)
7951 return Op;
7952
7953 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
7954 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
7955 MachineMemOperand *MemRef = MemOp->getMemOperand();
7956 DAG.setNodeMemRefs(NewNode, {MemRef});
7957 }
7958
7959 if (BaseOpcode->AtomicX2) {
7961 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
7962 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
7963 }
7964 if (BaseOpcode->Store)
7965 return SDValue(NewNode, 0);
7966 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
7967 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
7968 NumVDataDwords, IsAtomicPacked16Bit, DL);
7969}
7970
7971SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
7972 SDValue Offset, SDValue CachePolicy,
7973 SelectionDAG &DAG) const {
7975
7976 const DataLayout &DataLayout = DAG.getDataLayout();
7977 Align Alignment =
7979
7984 VT.getStoreSize(), Alignment);
7985
7986 if (!Offset->isDivergent()) {
7987 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
7988
7989 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
7990 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
7991 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
7992 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
7993 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
7994 SDValue BufferLoad =
7996 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7997 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7998 }
7999
8000 // Widen vec3 load to vec4.
8001 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8002 !Subtarget->hasScalarDwordx3Loads()) {
8003 EVT WidenedVT =
8005 auto WidenedOp = DAG.getMemIntrinsicNode(
8006 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8007 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8008 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8009 DAG.getVectorIdxConstant(0, DL));
8010 return Subvector;
8011 }
8012
8014 DAG.getVTList(VT), Ops, VT, MMO);
8015 }
8016
8017 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8018 // assume that the buffer is unswizzled.
8019 SDValue Ops[] = {
8020 DAG.getEntryNode(), // Chain
8021 Rsrc, // rsrc
8022 DAG.getConstant(0, DL, MVT::i32), // vindex
8023 {}, // voffset
8024 {}, // soffset
8025 {}, // offset
8026 CachePolicy, // cachepolicy
8027 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8028 };
8029 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8030 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8031 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8032 }
8033
8035 unsigned NumLoads = 1;
8036 MVT LoadVT = VT.getSimpleVT();
8037 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8038 assert((LoadVT.getScalarType() == MVT::i32 ||
8039 LoadVT.getScalarType() == MVT::f32));
8040
8041 if (NumElts == 8 || NumElts == 16) {
8042 NumLoads = NumElts / 4;
8043 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8044 }
8045
8046 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8047
8048 // Use the alignment to ensure that the required offsets will fit into the
8049 // immediate offsets.
8050 setBufferOffsets(Offset, DAG, &Ops[3],
8051 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8052
8053 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8054 for (unsigned i = 0; i < NumLoads; ++i) {
8055 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8056 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8057 LoadVT, MMO, DAG));
8058 }
8059
8060 if (NumElts == 8 || NumElts == 16)
8061 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8062
8063 return Loads[0];
8064}
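// Illustrative example for the divergent-offset path above (sizes assumed):
// an s.buffer.load of v8f32 with a divergent offset becomes two 16-byte MUBUF
// loads, i.e. NumElts = 8, NumLoads = 8 / 4 = 2, LoadVT = v4f32, with the
// second load at InstOffset + 16; the two results are then glued back
// together with CONCAT_VECTORS into the original v8f32.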
8065
8066SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8067 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8068 if (!Subtarget->hasArchitectedSGPRs())
8069 return {};
8070 SDLoc SL(Op);
8071 MVT VT = MVT::i32;
8072 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8073 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8074 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8075}
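// A minimal model of the extraction above (sketch only): isolating bits
// [29:25] of TTMP8 is equivalent to
//
//   uint32_t WaveIdInGroup = (ttmp8 >> 25) & 0x1f;   // BFE_U32 ttmp8, 25, 5
//
// where `ttmp8` stands for the value copied out of the architected TTMP8
// register.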
8076
8077SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8078 unsigned Dim,
8079 const ArgDescriptor &Arg) const {
8080 SDLoc SL(Op);
8082 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8083 if (MaxID == 0)
8084 return DAG.getConstant(0, SL, MVT::i32);
8085
8086 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8087 SDLoc(DAG.getEntryNode()), Arg);
8088
8089 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8090 // masking operations anyway.
8091 //
8092 // TODO: We could assert the top bit is 0 for the source copy.
8093 if (Arg.isMasked())
8094 return Val;
8095
8096 // Preserve the known bits after expansion to a copy.
8098 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8099 DAG.getValueType(SmallVT));
8100}
8101
8102SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8103 SelectionDAG &DAG) const {
8105 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8106
8107 EVT VT = Op.getValueType();
8108 SDLoc DL(Op);
8109 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8110
8111 // TODO: Should this propagate fast-math-flags?
8112
8113 switch (IntrinsicID) {
8114 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8115 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8116 return emitNonHSAIntrinsicError(DAG, DL, VT);
8117 return getPreloadedValue(DAG, *MFI, VT,
8119 }
8120 case Intrinsic::amdgcn_dispatch_ptr:
8121 case Intrinsic::amdgcn_queue_ptr: {
8122 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8123 DiagnosticInfoUnsupported BadIntrin(
8124 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8125 DL.getDebugLoc());
8126 DAG.getContext()->diagnose(BadIntrin);
8127 return DAG.getUNDEF(VT);
8128 }
8129
8130 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8132 return getPreloadedValue(DAG, *MFI, VT, RegID);
8133 }
8134 case Intrinsic::amdgcn_implicitarg_ptr: {
8135 if (MFI->isEntryFunction())
8136 return getImplicitArgPtr(DAG, DL);
8137 return getPreloadedValue(DAG, *MFI, VT,
8139 }
8140 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8142 // This only makes sense to call in a kernel, so just lower to null.
8143 return DAG.getConstant(0, DL, VT);
8144 }
8145
8146 return getPreloadedValue(DAG, *MFI, VT,
8148 }
8149 case Intrinsic::amdgcn_dispatch_id: {
8150 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8151 }
8152 case Intrinsic::amdgcn_rcp:
8153 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8154 case Intrinsic::amdgcn_rsq:
8155 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8156 case Intrinsic::amdgcn_rsq_legacy:
8158 return emitRemovedIntrinsicError(DAG, DL, VT);
8159 return SDValue();
8160 case Intrinsic::amdgcn_rcp_legacy:
8162 return emitRemovedIntrinsicError(DAG, DL, VT);
8163 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8164 case Intrinsic::amdgcn_rsq_clamp: {
8166 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8167
8168 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8171
8172 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8173 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8174 DAG.getConstantFP(Max, DL, VT));
8175 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8176 DAG.getConstantFP(Min, DL, VT));
8177 }
8178 case Intrinsic::r600_read_ngroups_x:
8179 if (Subtarget->isAmdHsaOS())
8180 return emitNonHSAIntrinsicError(DAG, DL, VT);
8181
8182 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8184 false);
8185 case Intrinsic::r600_read_ngroups_y:
8186 if (Subtarget->isAmdHsaOS())
8187 return emitNonHSAIntrinsicError(DAG, DL, VT);
8188
8189 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8191 false);
8192 case Intrinsic::r600_read_ngroups_z:
8193 if (Subtarget->isAmdHsaOS())
8194 return emitNonHSAIntrinsicError(DAG, DL, VT);
8195
8196 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8198 false);
8199 case Intrinsic::r600_read_global_size_x:
8200 if (Subtarget->isAmdHsaOS())
8201 return emitNonHSAIntrinsicError(DAG, DL, VT);
8202
8203 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8205 Align(4), false);
8206 case Intrinsic::r600_read_global_size_y:
8207 if (Subtarget->isAmdHsaOS())
8208 return emitNonHSAIntrinsicError(DAG, DL, VT);
8209
8210 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8212 Align(4), false);
8213 case Intrinsic::r600_read_global_size_z:
8214 if (Subtarget->isAmdHsaOS())
8215 return emitNonHSAIntrinsicError(DAG, DL, VT);
8216
8217 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8219 Align(4), false);
8220 case Intrinsic::r600_read_local_size_x:
8221 if (Subtarget->isAmdHsaOS())
8222 return emitNonHSAIntrinsicError(DAG, DL, VT);
8223
8224 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8226 case Intrinsic::r600_read_local_size_y:
8227 if (Subtarget->isAmdHsaOS())
8228 return emitNonHSAIntrinsicError(DAG, DL, VT);
8229
8230 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8232 case Intrinsic::r600_read_local_size_z:
8233 if (Subtarget->isAmdHsaOS())
8234 return emitNonHSAIntrinsicError(DAG, DL, VT);
8235
8236 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8238 case Intrinsic::amdgcn_workgroup_id_x:
8239 return getPreloadedValue(DAG, *MFI, VT,
8241 case Intrinsic::amdgcn_workgroup_id_y:
8242 return getPreloadedValue(DAG, *MFI, VT,
8244 case Intrinsic::amdgcn_workgroup_id_z:
8245 return getPreloadedValue(DAG, *MFI, VT,
8247 case Intrinsic::amdgcn_wave_id:
8248 return lowerWaveID(DAG, Op);
8249 case Intrinsic::amdgcn_lds_kernel_id: {
8250 if (MFI->isEntryFunction())
8251 return getLDSKernelId(DAG, DL);
8252 return getPreloadedValue(DAG, *MFI, VT,
8254 }
8255 case Intrinsic::amdgcn_workitem_id_x:
8256 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8257 case Intrinsic::amdgcn_workitem_id_y:
8258 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8259 case Intrinsic::amdgcn_workitem_id_z:
8260 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8261 case Intrinsic::amdgcn_wavefrontsize:
8263 SDLoc(Op), MVT::i32);
8264 case Intrinsic::amdgcn_s_buffer_load: {
8265 unsigned CPol = Op.getConstantOperandVal(3);
8266 // s_buffer_load, because of how it's optimized, can't be volatile
8267 // so reject ones with the volatile bit set.
8268 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8271 return Op;
8272 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8273 DAG);
8274 }
8275 case Intrinsic::amdgcn_fdiv_fast:
8276 return lowerFDIV_FAST(Op, DAG);
8277 case Intrinsic::amdgcn_sin:
8278 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8279
8280 case Intrinsic::amdgcn_cos:
8281 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8282
8283 case Intrinsic::amdgcn_mul_u24:
8284 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8285 case Intrinsic::amdgcn_mul_i24:
8286 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8287
8288 case Intrinsic::amdgcn_log_clamp: {
8290 return SDValue();
8291
8292 return emitRemovedIntrinsicError(DAG, DL, VT);
8293 }
8294 case Intrinsic::amdgcn_fract:
8295 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8296
8297 case Intrinsic::amdgcn_class:
8298 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8299 Op.getOperand(1), Op.getOperand(2));
8300 case Intrinsic::amdgcn_div_fmas:
8301 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8302 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8303 Op.getOperand(4));
8304
8305 case Intrinsic::amdgcn_div_fixup:
8306 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8307 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8308
8309 case Intrinsic::amdgcn_div_scale: {
8310 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8311
8312 // Translate to the operands expected by the machine instruction. The
8313 // first parameter must be the same as the first instruction.
8314 SDValue Numerator = Op.getOperand(1);
8315 SDValue Denominator = Op.getOperand(2);
8316
8317 // Note this order is opposite of the machine instruction's operations,
8318 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8319 // intrinsic has the numerator as the first operand to match a normal
8320 // division operation.
8321
8322 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8323
8324 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8325 Denominator, Numerator);
8326 }
8327 case Intrinsic::amdgcn_icmp: {
8328 // There is a Pat that handles this variant, so return it as-is.
8329 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8330 Op.getConstantOperandVal(2) == 0 &&
8331 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8332 return Op;
8333 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8334 }
8335 case Intrinsic::amdgcn_fcmp: {
8336 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8337 }
8338 case Intrinsic::amdgcn_ballot:
8339 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8340 case Intrinsic::amdgcn_fmed3:
8341 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8342 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8343 case Intrinsic::amdgcn_fdot2:
8344 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8345 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8346 Op.getOperand(4));
8347 case Intrinsic::amdgcn_fmul_legacy:
8348 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8349 Op.getOperand(1), Op.getOperand(2));
8350 case Intrinsic::amdgcn_sffbh:
8351 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8352 case Intrinsic::amdgcn_sbfe:
8353 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8354 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8355 case Intrinsic::amdgcn_ubfe:
8356 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8357 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8358 case Intrinsic::amdgcn_cvt_pkrtz:
8359 case Intrinsic::amdgcn_cvt_pknorm_i16:
8360 case Intrinsic::amdgcn_cvt_pknorm_u16:
8361 case Intrinsic::amdgcn_cvt_pk_i16:
8362 case Intrinsic::amdgcn_cvt_pk_u16: {
8363 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8364 EVT VT = Op.getValueType();
8365 unsigned Opcode;
8366
8367 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8369 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8371 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8373 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8375 else
8377
8378 if (isTypeLegal(VT))
8379 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8380
8381 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8382 Op.getOperand(1), Op.getOperand(2));
8383 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8384 }
8385 case Intrinsic::amdgcn_fmad_ftz:
8386 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8387 Op.getOperand(2), Op.getOperand(3));
8388
8389 case Intrinsic::amdgcn_if_break:
8390 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8391 Op->getOperand(1), Op->getOperand(2)), 0);
8392
8393 case Intrinsic::amdgcn_groupstaticsize: {
8395 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8396 return Op;
8397
8398 const Module *M = MF.getFunction().getParent();
8399 const GlobalValue *GV =
8400 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8401 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8403 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8404 }
8405 case Intrinsic::amdgcn_is_shared:
8406 case Intrinsic::amdgcn_is_private: {
8407 SDLoc SL(Op);
8408 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8410 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8411 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8412 Op.getOperand(1));
8413
8414 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8415 DAG.getConstant(1, SL, MVT::i32));
8416 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8417 }
8418 case Intrinsic::amdgcn_perm:
8419 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8420 Op.getOperand(2), Op.getOperand(3));
8421 case Intrinsic::amdgcn_reloc_constant: {
8422 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8423 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8424 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8425 auto RelocSymbol = cast<GlobalVariable>(
8426 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8427 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8429 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8430 }
8431 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8432 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8433 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8434 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8435 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8436 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8437 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8438 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8439 if (Op.getOperand(4).getValueType() == MVT::i32)
8440 return SDValue();
8441
8442 SDLoc SL(Op);
8443 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8444 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8445 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8446 Op.getOperand(3), IndexKeyi32);
8447 }
8448 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8449 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8450 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8451 if (Op.getOperand(6).getValueType() == MVT::i32)
8452 return SDValue();
8453
8454 SDLoc SL(Op);
8455 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8456 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8457 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8458 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8459 IndexKeyi32, Op.getOperand(7)});
8460 }
8461 case Intrinsic::amdgcn_addrspacecast_nonnull:
8462 return lowerADDRSPACECAST(Op, DAG);
8463 default:
8464 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8466 return lowerImage(Op, ImageDimIntr, DAG, false);
8467
8468 return Op;
8469 }
8470}
8471
8472// On targets that do not support a constant in the soffset field, turn a zero
8473// soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
8475 const GCNSubtarget *Subtarget) {
8476 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8477 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8478 return SOffset;
8479}
8480
8481SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8482 SelectionDAG &DAG,
8483 unsigned NewOpcode) const {
8484 SDLoc DL(Op);
8485
8486 SDValue VData = Op.getOperand(2);
8487 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8488 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8489 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8490 SDValue Ops[] = {
8491 Op.getOperand(0), // Chain
8492 VData, // vdata
8493 Rsrc, // rsrc
8494 DAG.getConstant(0, DL, MVT::i32), // vindex
8495 Offsets.first, // voffset
8496 SOffset, // soffset
8497 Offsets.second, // offset
8498 Op.getOperand(6), // cachepolicy
8499 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8500 };
8501
8502 auto *M = cast<MemSDNode>(Op);
8503
8504 EVT MemVT = VData.getValueType();
8505 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8506 M->getMemOperand());
8507}
8508
8509// Return a value to use for the idxen operand by examining the vindex operand.
8510static unsigned getIdxEn(SDValue VIndex) {
8511 // No need to set idxen if vindex is known to be zero.
8512 return isNullConstant(VIndex) ? 0 : 1;
8513}
8514
8515SDValue
8516SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8517 unsigned NewOpcode) const {
8518 SDLoc DL(Op);
8519
8520 SDValue VData = Op.getOperand(2);
8521 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8522 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8523 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8524 SDValue Ops[] = {
8525 Op.getOperand(0), // Chain
8526 VData, // vdata
8527 Rsrc, // rsrc
8528 Op.getOperand(4), // vindex
8529 Offsets.first, // voffset
8530 SOffset, // soffset
8531 Offsets.second, // offset
8532 Op.getOperand(7), // cachepolicy
8533 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8534 };
8535
8536 auto *M = cast<MemSDNode>(Op);
8537
8538 EVT MemVT = VData.getValueType();
8539 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8540 M->getMemOperand());
8541}
8542
8543SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8544 SelectionDAG &DAG) const {
8545 unsigned IntrID = Op.getConstantOperandVal(1);
8546 SDLoc DL(Op);
8547
8548 switch (IntrID) {
8549 case Intrinsic::amdgcn_ds_ordered_add:
8550 case Intrinsic::amdgcn_ds_ordered_swap: {
8551 MemSDNode *M = cast<MemSDNode>(Op);
8552 SDValue Chain = M->getOperand(0);
8553 SDValue M0 = M->getOperand(2);
8554 SDValue Value = M->getOperand(3);
8555 unsigned IndexOperand = M->getConstantOperandVal(7);
8556 unsigned WaveRelease = M->getConstantOperandVal(8);
8557 unsigned WaveDone = M->getConstantOperandVal(9);
8558
8559 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8560 IndexOperand &= ~0x3f;
8561 unsigned CountDw = 0;
8562
8563 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8564 CountDw = (IndexOperand >> 24) & 0xf;
8565 IndexOperand &= ~(0xf << 24);
8566
8567 if (CountDw < 1 || CountDw > 4) {
8569 "ds_ordered_count: dword count must be between 1 and 4");
8570 }
8571 }
8572
8573 if (IndexOperand)
8574 report_fatal_error("ds_ordered_count: bad index operand");
8575
8576 if (WaveDone && !WaveRelease)
8577 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8578
8579 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8580 unsigned ShaderType =
8582 unsigned Offset0 = OrderedCountIndex << 2;
8583 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8584
8585 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8586 Offset1 |= (CountDw - 1) << 6;
8587
8588 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8589 Offset1 |= ShaderType << 2;
8590
8591 unsigned Offset = Offset0 | (Offset1 << 8);
8592
8593 SDValue Ops[] = {
8594 Chain,
8595 Value,
8596 DAG.getTargetConstant(Offset, DL, MVT::i16),
8597 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8598 };
8598
8599    return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8600                                   M->getVTList(), Ops, M->getMemoryVT(),
8601 M->getMemOperand());
8602 }
8603 case Intrinsic::amdgcn_ds_fadd: {
8604 MemSDNode *M = cast<MemSDNode>(Op);
8605 unsigned Opc;
8606 switch (IntrID) {
8607 case Intrinsic::amdgcn_ds_fadd:
8608      Opc = ISD::ATOMIC_LOAD_FADD;
8609      break;
8610 }
8611
8612 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
8613 M->getOperand(0), M->getOperand(2), M->getOperand(3),
8614 M->getMemOperand());
8615 }
8616 case Intrinsic::amdgcn_ds_fmin:
8617 case Intrinsic::amdgcn_ds_fmax: {
8618 MemSDNode *M = cast<MemSDNode>(Op);
8619 unsigned Opc;
8620 switch (IntrID) {
8621 case Intrinsic::amdgcn_ds_fmin:
8622      Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8623      break;
8624 case Intrinsic::amdgcn_ds_fmax:
8625      Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8626      break;
8627 default:
8628 llvm_unreachable("Unknown intrinsic!");
8629 }
8630 SDValue Ops[] = {
8631 M->getOperand(0), // Chain
8632 M->getOperand(2), // Ptr
8633 M->getOperand(3) // Value
8634 };
8635
8636 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
8637 M->getMemoryVT(), M->getMemOperand());
8638 }
8639 case Intrinsic::amdgcn_buffer_load:
8640 case Intrinsic::amdgcn_buffer_load_format: {
8641 unsigned Glc = Op.getConstantOperandVal(5);
8642 unsigned Slc = Op.getConstantOperandVal(6);
8643 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8644 SDValue Ops[] = {
8645 Op.getOperand(0), // Chain
8646 Op.getOperand(2), // rsrc
8647 Op.getOperand(3), // vindex
8648 SDValue(), // voffset -- will be set by setBufferOffsets
8649 SDValue(), // soffset -- will be set by setBufferOffsets
8650 SDValue(), // offset -- will be set by setBufferOffsets
8651 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8652 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8653 };
8654 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
8655
8656    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8657        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
8658
8659 EVT VT = Op.getValueType();
8660 EVT IntVT = VT.changeTypeToInteger();
8661 auto *M = cast<MemSDNode>(Op);
8662 EVT LoadVT = Op.getValueType();
8663
8664 if (LoadVT.getScalarType() == MVT::f16)
8665 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
8666 M, DAG, Ops);
8667
8668 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
8669 if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
8670 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
8671 M->getMemOperand());
8672
8673 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
8674 M->getMemOperand(), DAG);
8675 }
8676 case Intrinsic::amdgcn_raw_buffer_load:
8677 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8678 case Intrinsic::amdgcn_raw_buffer_load_format:
8679 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8680 const bool IsFormat =
8681 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8682 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8683
8684 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8685 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8686 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8687 SDValue Ops[] = {
8688 Op.getOperand(0), // Chain
8689 Rsrc, // rsrc
8690 DAG.getConstant(0, DL, MVT::i32), // vindex
8691 Offsets.first, // voffset
8692 SOffset, // soffset
8693 Offsets.second, // offset
8694 Op.getOperand(5), // cachepolicy, swizzled buffer
8695 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8696 };
8697
8698 auto *M = cast<MemSDNode>(Op);
8699 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8700 }
8701 case Intrinsic::amdgcn_struct_buffer_load:
8702 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8703 case Intrinsic::amdgcn_struct_buffer_load_format:
8704 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8705 const bool IsFormat =
8706 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8707 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8708
8709 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8710 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8711 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8712 SDValue Ops[] = {
8713 Op.getOperand(0), // Chain
8714 Rsrc, // rsrc
8715 Op.getOperand(3), // vindex
8716 Offsets.first, // voffset
8717 SOffset, // soffset
8718 Offsets.second, // offset
8719 Op.getOperand(6), // cachepolicy, swizzled buffer
8720 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8721 };
8722
8723 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8724 }
8725 case Intrinsic::amdgcn_tbuffer_load: {
8726 MemSDNode *M = cast<MemSDNode>(Op);
8727 EVT LoadVT = Op.getValueType();
8728
8729 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8730 unsigned Dfmt = Op.getConstantOperandVal(7);
8731 unsigned Nfmt = Op.getConstantOperandVal(8);
8732 unsigned Glc = Op.getConstantOperandVal(9);
8733 unsigned Slc = Op.getConstantOperandVal(10);
8734 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8735 SDValue Ops[] = {
8736 Op.getOperand(0), // Chain
8737 Op.getOperand(2), // rsrc
8738 Op.getOperand(3), // vindex
8739 Op.getOperand(4), // voffset
8740 SOffset, // soffset
8741 Op.getOperand(6), // offset
8742 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8743 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8744 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8745 };
8746
8747 if (LoadVT.getScalarType() == MVT::f16)
8748 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8749 M, DAG, Ops);
8750 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8751 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8752 DAG);
8753 }
8754 case Intrinsic::amdgcn_raw_tbuffer_load:
8755 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8756 MemSDNode *M = cast<MemSDNode>(Op);
8757 EVT LoadVT = Op.getValueType();
8758 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8759 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8760 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8761
8762 SDValue Ops[] = {
8763 Op.getOperand(0), // Chain
8764 Rsrc, // rsrc
8765 DAG.getConstant(0, DL, MVT::i32), // vindex
8766 Offsets.first, // voffset
8767 SOffset, // soffset
8768 Offsets.second, // offset
8769 Op.getOperand(5), // format
8770 Op.getOperand(6), // cachepolicy, swizzled buffer
8771 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8772 };
8773
8774 if (LoadVT.getScalarType() == MVT::f16)
8775 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8776 M, DAG, Ops);
8777 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8778 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8779 DAG);
8780 }
8781 case Intrinsic::amdgcn_struct_tbuffer_load:
8782 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8783 MemSDNode *M = cast<MemSDNode>(Op);
8784 EVT LoadVT = Op.getValueType();
8785 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8786 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8787 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8788
8789 SDValue Ops[] = {
8790 Op.getOperand(0), // Chain
8791 Rsrc, // rsrc
8792 Op.getOperand(3), // vindex
8793 Offsets.first, // voffset
8794 SOffset, // soffset
8795 Offsets.second, // offset
8796 Op.getOperand(6), // format
8797 Op.getOperand(7), // cachepolicy, swizzled buffer
8798 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8799 };
8800
8801 if (LoadVT.getScalarType() == MVT::f16)
8802 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8803 M, DAG, Ops);
8804 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8805 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8806 DAG);
8807 }
8808 case Intrinsic::amdgcn_buffer_atomic_swap:
8809 case Intrinsic::amdgcn_buffer_atomic_add:
8810 case Intrinsic::amdgcn_buffer_atomic_sub:
8811 case Intrinsic::amdgcn_buffer_atomic_csub:
8812 case Intrinsic::amdgcn_buffer_atomic_smin:
8813 case Intrinsic::amdgcn_buffer_atomic_umin:
8814 case Intrinsic::amdgcn_buffer_atomic_smax:
8815 case Intrinsic::amdgcn_buffer_atomic_umax:
8816 case Intrinsic::amdgcn_buffer_atomic_and:
8817 case Intrinsic::amdgcn_buffer_atomic_or:
8818 case Intrinsic::amdgcn_buffer_atomic_xor:
8819 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8820 unsigned Slc = Op.getConstantOperandVal(6);
8821 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8822 SDValue Ops[] = {
8823 Op.getOperand(0), // Chain
8824 Op.getOperand(2), // vdata
8825 Op.getOperand(3), // rsrc
8826 Op.getOperand(4), // vindex
8827 SDValue(), // voffset -- will be set by setBufferOffsets
8828 SDValue(), // soffset -- will be set by setBufferOffsets
8829 SDValue(), // offset -- will be set by setBufferOffsets
8830 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8831 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8832 };
8833 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8834
8835 EVT VT = Op.getValueType();
8836
8837 auto *M = cast<MemSDNode>(Op);
8838 unsigned Opcode = 0;
8839
8840    switch (IntrID) {
8841    case Intrinsic::amdgcn_buffer_atomic_swap:
8842      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
8843      break;
8844    case Intrinsic::amdgcn_buffer_atomic_add:
8845      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
8846      break;
8847    case Intrinsic::amdgcn_buffer_atomic_sub:
8848      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
8849      break;
8850    case Intrinsic::amdgcn_buffer_atomic_csub:
8851      Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
8852      break;
8853    case Intrinsic::amdgcn_buffer_atomic_smin:
8854      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
8855      break;
8856    case Intrinsic::amdgcn_buffer_atomic_umin:
8857      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
8858      break;
8859    case Intrinsic::amdgcn_buffer_atomic_smax:
8860      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
8861      break;
8862    case Intrinsic::amdgcn_buffer_atomic_umax:
8863      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
8864      break;
8865    case Intrinsic::amdgcn_buffer_atomic_and:
8866      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
8867      break;
8868    case Intrinsic::amdgcn_buffer_atomic_or:
8869      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
8870      break;
8871    case Intrinsic::amdgcn_buffer_atomic_xor:
8872      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
8873      break;
8874    case Intrinsic::amdgcn_buffer_atomic_fadd:
8875      Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
8876      break;
8877 default:
8878 llvm_unreachable("unhandled atomic opcode");
8879 }
8880
8881 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
8882 M->getMemOperand());
8883 }
8884 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8885 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8886 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8887 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8888    return lowerRawBufferAtomicIntrin(Op, DAG,
8889                                      AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8890 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8892 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8893 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8894    return lowerStructBufferAtomicIntrin(Op, DAG,
8895                                         AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8896 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8898 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8899 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8900 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8901 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8902 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8904 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8905 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8907 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8908 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8910 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
8911 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8913 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8914 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8916 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8917 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8918 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8919 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
8920 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8922 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
8923 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8925 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
8926 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8928 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
8929 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8930 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8931 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8932 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8933 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8934 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8935 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8936 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8937 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8938 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8939 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8940 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8941 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8943 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8944 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8945    return lowerRawBufferAtomicIntrin(Op, DAG,
8946                                      AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8947 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8949    return lowerStructBufferAtomicIntrin(Op, DAG,
8950                                         AMDGPUISD::BUFFER_ATOMIC_SWAP);
8951 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8952 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8953 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
8954 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8955 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8956 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
8957 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8959    return lowerStructBufferAtomicIntrin(Op, DAG,
8960                                         AMDGPUISD::BUFFER_ATOMIC_SMIN);
8961 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8963    return lowerStructBufferAtomicIntrin(Op, DAG,
8964                                         AMDGPUISD::BUFFER_ATOMIC_UMIN);
8965 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8966 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8967    return lowerStructBufferAtomicIntrin(Op, DAG,
8968                                         AMDGPUISD::BUFFER_ATOMIC_SMAX);
8969 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8971    return lowerStructBufferAtomicIntrin(Op, DAG,
8972                                         AMDGPUISD::BUFFER_ATOMIC_UMAX);
8973 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8975 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
8976 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8977 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8978 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
8979 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8980 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8981 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
8982 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8983 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8984 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
8985 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8986 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8987 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
8988 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8989    return lowerStructBufferAtomicIntrin(Op, DAG,
8990                                         AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
8991
8992 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
8993 unsigned Slc = Op.getConstantOperandVal(7);
8994 unsigned IdxEn = getIdxEn(Op.getOperand(5));
8995 SDValue Ops[] = {
8996 Op.getOperand(0), // Chain
8997 Op.getOperand(2), // src
8998 Op.getOperand(3), // cmp
8999 Op.getOperand(4), // rsrc
9000 Op.getOperand(5), // vindex
9001 SDValue(), // voffset -- will be set by setBufferOffsets
9002 SDValue(), // soffset -- will be set by setBufferOffsets
9003 SDValue(), // offset -- will be set by setBufferOffsets
9004 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
9005 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9006 };
9007 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
9008
9009 EVT VT = Op.getValueType();
9010 auto *M = cast<MemSDNode>(Op);
9011
9012    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9013                                   Op->getVTList(), Ops, VT, M->getMemOperand());
9014 }
9015 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9017 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9018 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9019 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9020 SDValue Ops[] = {
9021 Op.getOperand(0), // Chain
9022 Op.getOperand(2), // src
9023 Op.getOperand(3), // cmp
9024 Rsrc, // rsrc
9025 DAG.getConstant(0, DL, MVT::i32), // vindex
9026 Offsets.first, // voffset
9027 SOffset, // soffset
9028 Offsets.second, // offset
9029 Op.getOperand(7), // cachepolicy
9030 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9031 };
9032 EVT VT = Op.getValueType();
9033 auto *M = cast<MemSDNode>(Op);
9034
9035    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9036                                   Op->getVTList(), Ops, VT, M->getMemOperand());
9037 }
9038 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9039 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9040 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9041 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9042 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9043 SDValue Ops[] = {
9044 Op.getOperand(0), // Chain
9045 Op.getOperand(2), // src
9046 Op.getOperand(3), // cmp
9047 Rsrc, // rsrc
9048 Op.getOperand(5), // vindex
9049 Offsets.first, // voffset
9050 SOffset, // soffset
9051 Offsets.second, // offset
9052 Op.getOperand(8), // cachepolicy
9053 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9054 };
9055 EVT VT = Op.getValueType();
9056 auto *M = cast<MemSDNode>(Op);
9057
9058    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9059                                   Op->getVTList(), Ops, VT, M->getMemOperand());
9060 }
9061 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9062 MemSDNode *M = cast<MemSDNode>(Op);
9063 SDValue NodePtr = M->getOperand(2);
9064 SDValue RayExtent = M->getOperand(3);
9065 SDValue RayOrigin = M->getOperand(4);
9066 SDValue RayDir = M->getOperand(5);
9067 SDValue RayInvDir = M->getOperand(6);
9068 SDValue TDescr = M->getOperand(7);
9069
9070 assert(NodePtr.getValueType() == MVT::i32 ||
9071 NodePtr.getValueType() == MVT::i64);
9072 assert(RayDir.getValueType() == MVT::v3f16 ||
9073 RayDir.getValueType() == MVT::v3f32);
9074
9075 if (!Subtarget->hasGFX10_AEncoding()) {
9076 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9077 return SDValue();
9078 }
9079
9080 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9081 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9082 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9083 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9084 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9085 const unsigned NumVDataDwords = 4;
9086 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9087 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9088 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9089 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9090 IsGFX12Plus;
9091 const unsigned BaseOpcodes[2][2] = {
9092 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9093 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9094 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9095 int Opcode;
9096 if (UseNSA) {
9097 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9098 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9099 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9100 : AMDGPU::MIMGEncGfx10NSA,
9101 NumVDataDwords, NumVAddrDwords);
9102 } else {
9103 assert(!IsGFX12Plus);
9104 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9105 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9106 : AMDGPU::MIMGEncGfx10Default,
9107 NumVDataDwords, NumVAddrDwords);
9108 }
9109 assert(Opcode != -1);
9110
9111    SmallVector<SDValue, 16> Ops;
9112
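    // packLanes appends a 3-component ray operand as dword-sized values: f32
    // lanes are bitcast to i32 directly, while f16 lanes are packed in pairs,
    // optionally sharing a dword with the previous operand's trailing half
    // when not dword-aligned.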
9113 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9114      SmallVector<SDValue, 3> Lanes;
9115      DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9116 if (Lanes[0].getValueSizeInBits() == 32) {
9117 for (unsigned I = 0; I < 3; ++I)
9118 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9119 } else {
9120 if (IsAligned) {
9121 Ops.push_back(
9122 DAG.getBitcast(MVT::i32,
9123 DAG.getBuildVector(MVT::v2f16, DL,
9124 { Lanes[0], Lanes[1] })));
9125 Ops.push_back(Lanes[2]);
9126 } else {
9127 SDValue Elt0 = Ops.pop_back_val();
9128 Ops.push_back(
9129 DAG.getBitcast(MVT::i32,
9130 DAG.getBuildVector(MVT::v2f16, DL,
9131 { Elt0, Lanes[0] })));
9132 Ops.push_back(
9133 DAG.getBitcast(MVT::i32,
9134 DAG.getBuildVector(MVT::v2f16, DL,
9135 { Lanes[1], Lanes[2] })));
9136 }
9137 }
9138 };
9139
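    // GFX11+ NSA forms take the ray operands as separate (vector) registers;
    // older encodings take one flat list of dwords, built via packLanes.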
9140 if (UseNSA && IsGFX11Plus) {
9141 Ops.push_back(NodePtr);
9142 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9143 Ops.push_back(RayOrigin);
9144 if (IsA16) {
9145 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9146 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9147 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9148 for (unsigned I = 0; I < 3; ++I) {
9149 MergedLanes.push_back(DAG.getBitcast(
9150 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9151 {DirLanes[I], InvDirLanes[I]})));
9152 }
9153 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9154 } else {
9155 Ops.push_back(RayDir);
9156 Ops.push_back(RayInvDir);
9157 }
9158 } else {
9159 if (Is64)
9160 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9161 2);
9162 else
9163 Ops.push_back(NodePtr);
9164
9165 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9166 packLanes(RayOrigin, true);
9167 packLanes(RayDir, true);
9168 packLanes(RayInvDir, false);
9169 }
9170
9171 if (!UseNSA) {
9172 // Build a single vector containing all the operands so far prepared.
9173 if (NumVAddrDwords > 12) {
9174 SDValue Undef = DAG.getUNDEF(MVT::i32);
9175 Ops.append(16 - Ops.size(), Undef);
9176 }
9177 assert(Ops.size() >= 8 && Ops.size() <= 12);
9178 SDValue MergedOps = DAG.getBuildVector(
9179 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9180 Ops.clear();
9181 Ops.push_back(MergedOps);
9182 }
9183
9184 Ops.push_back(TDescr);
9185 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9186 Ops.push_back(M->getChain());
9187
9188 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9189 MachineMemOperand *MemRef = M->getMemOperand();
9190 DAG.setNodeMemRefs(NewNode, {MemRef});
9191 return SDValue(NewNode, 0);
9192 }
9193 case Intrinsic::amdgcn_global_atomic_fmin:
9194 case Intrinsic::amdgcn_global_atomic_fmax:
9195 case Intrinsic::amdgcn_global_atomic_fmin_num:
9196 case Intrinsic::amdgcn_global_atomic_fmax_num:
9197 case Intrinsic::amdgcn_flat_atomic_fmin:
9198 case Intrinsic::amdgcn_flat_atomic_fmax:
9199 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9200 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9201 MemSDNode *M = cast<MemSDNode>(Op);
9202 SDValue Ops[] = {
9203 M->getOperand(0), // Chain
9204 M->getOperand(2), // Ptr
9205 M->getOperand(3) // Value
9206 };
9207 unsigned Opcode = 0;
9208 switch (IntrID) {
9209 case Intrinsic::amdgcn_global_atomic_fmin:
9210 case Intrinsic::amdgcn_global_atomic_fmin_num:
9211 case Intrinsic::amdgcn_flat_atomic_fmin:
9212 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9213      Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9214      break;
9215 }
9216 case Intrinsic::amdgcn_global_atomic_fmax:
9217 case Intrinsic::amdgcn_global_atomic_fmax_num:
9218 case Intrinsic::amdgcn_flat_atomic_fmax:
9219 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9220      Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9221      break;
9222 }
9223 default:
9224 llvm_unreachable("unhandled atomic opcode");
9225 }
9226 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
9227 M->getVTList(), Ops, M->getMemoryVT(),
9228 M->getMemOperand());
9229 }
9230 case Intrinsic::amdgcn_s_get_barrier_state: {
9231 SDValue Chain = Op->getOperand(0);
9232    SmallVector<SDValue, 2> Ops;
9233    unsigned Opc;
9234 bool IsInlinableBarID = false;
9235 int64_t BarID;
9236
9237 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9238 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9239 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9240 }
9241
9242 if (IsInlinableBarID) {
9243 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9244 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9245 Ops.push_back(K);
9246 } else {
9247 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9248 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9249 Ops.push_back(M0Val.getValue(0));
9250 }
9251
9252 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9253 return SDValue(NewMI, 0);
9254 }
9255 default:
9256
9257 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9258            AMDGPU::getImageDimIntrinsicInfo(IntrID))
9259      return lowerImage(Op, ImageDimIntr, DAG, true);
9260
9261 return SDValue();
9262 }
9263}
9264
9265// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9266// dwordx4 if on SI and handle TFE loads.
9267SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9268 SDVTList VTList,
9269 ArrayRef<SDValue> Ops, EVT MemVT,
9270 MachineMemOperand *MMO,
9271 SelectionDAG &DAG) const {
9272 LLVMContext &C = *DAG.getContext();
9273  MachineFunction &MF = DAG.getMachineFunction();
9274  EVT VT = VTList.VTs[0];
9275
9276 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9277 bool IsTFE = VTList.NumVTs == 3;
9278 if (IsTFE) {
9279 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9280 unsigned NumOpDWords = NumValueDWords + 1;
9281 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9282 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9283 MachineMemOperand *OpDWordsMMO =
9284 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9285 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9286 OpDWordsVT, OpDWordsMMO, DAG);
9287    SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9288                                 DAG.getVectorIdxConstant(NumValueDWords, DL));
9289 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9290 SDValue ValueDWords =
9291 NumValueDWords == 1
9292 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9293            : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9294                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9295 ZeroIdx);
9296 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9297 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9298 }
9299
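  // Targets without dwordx3 load/store instructions get a widened dwordx4
  // load, and the extra element is dropped with EXTRACT_SUBVECTOR afterwards.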
9300 if (!Subtarget->hasDwordx3LoadStores() &&
9301 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9302 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9303 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9304 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9305 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9306 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9307 WidenedMemVT, WidenedMMO);
9308    SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9309                                DAG.getVectorIdxConstant(0, DL));
9310 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9311 }
9312
9313 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9314}
9315
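// Repack a 16-bit-component store value into the layout the subtarget's D16
// buffer/image store instructions expect (unpacked dwords, packed v2i16
// pairs, or a widened vector), depending on the hardware quirks below.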
9316SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9317 bool ImageStore) const {
9318 EVT StoreVT = VData.getValueType();
9319
9320 // No change for f16 and legal vector D16 types.
9321 if (!StoreVT.isVector())
9322 return VData;
9323
9324 SDLoc DL(VData);
9325 unsigned NumElements = StoreVT.getVectorNumElements();
9326
9327 if (Subtarget->hasUnpackedD16VMem()) {
9328 // We need to unpack the packed data to store.
9329 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9330 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9331
9332 EVT EquivStoreVT =
9333 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9334 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9335 return DAG.UnrollVectorOp(ZExt.getNode());
9336 }
9337
9338 // The sq block of gfx8.1 does not estimate register use correctly for d16
9339 // image store instructions. The data operand is computed as if it were not a
9340 // d16 image instruction.
9341 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9342 // Bitcast to i16
9343 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9344 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9345
9346 // Decompose into scalars
9347    SmallVector<SDValue, 4> Elts;
9348    DAG.ExtractVectorElements(IntVData, Elts);
9349
9350 // Group pairs of i16 into v2i16 and bitcast to i32
9351 SmallVector<SDValue, 4> PackedElts;
9352 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9353 SDValue Pair =
9354 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9355 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9356 PackedElts.push_back(IntPair);
9357 }
9358 if ((NumElements % 2) == 1) {
9359 // Handle v3i16
9360 unsigned I = Elts.size() / 2;
9361 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9362 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9363 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9364 PackedElts.push_back(IntPair);
9365 }
9366
9367 // Pad using UNDEF
9368 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9369
9370 // Build final vector
9371 EVT VecVT =
9372 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9373 return DAG.getBuildVector(VecVT, DL, PackedElts);
9374 }
9375
9376 if (NumElements == 3) {
9377 EVT IntStoreVT =
9378        EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9379    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9380
9381 EVT WidenedStoreVT = EVT::getVectorVT(
9382 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9383 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9384 WidenedStoreVT.getStoreSizeInBits());
9385 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9386 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9387 }
9388
9389 assert(isTypeLegal(StoreVT));
9390 return VData;
9391}
9392
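// Lower side-effecting intrinsics that produce no value: exports, barriers,
// (t)buffer stores, and the *_load_lds copies from memory into LDS.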
9393SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9394 SelectionDAG &DAG) const {
9395 SDLoc DL(Op);
9396 SDValue Chain = Op.getOperand(0);
9397 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9398  MachineFunction &MF = DAG.getMachineFunction();
9399
9400 switch (IntrinsicID) {
9401 case Intrinsic::amdgcn_exp_compr: {
9402 if (!Subtarget->hasCompressedExport()) {
9403 DiagnosticInfoUnsupported BadIntrin(
9405 "intrinsic not supported on subtarget", DL.getDebugLoc());
9406 DAG.getContext()->diagnose(BadIntrin);
9407 }
9408 SDValue Src0 = Op.getOperand(4);
9409 SDValue Src1 = Op.getOperand(5);
9410 // Hack around illegal type on SI by directly selecting it.
9411 if (isTypeLegal(Src0.getValueType()))
9412 return SDValue();
9413
9414 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9415 SDValue Undef = DAG.getUNDEF(MVT::f32);
9416 const SDValue Ops[] = {
9417 Op.getOperand(2), // tgt
9418 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9419 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9420 Undef, // src2
9421 Undef, // src3
9422 Op.getOperand(7), // vm
9423 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9424 Op.getOperand(3), // en
9425 Op.getOperand(0) // Chain
9426 };
9427
9428 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9429 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9430 }
9431 case Intrinsic::amdgcn_s_barrier: {
9432    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9433    if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9434      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9435 if (WGSize <= ST.getWavefrontSize())
9436 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9437 Op.getOperand(0)), 0);
9438 }
9439
9440 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9441 if (ST.hasSplitBarriers()) {
9442 SDValue K =
9443          DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9444      SDValue BarSignal =
9445 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9446 MVT::Other, K, Op.getOperand(0)),
9447 0);
9448 SDValue BarWait =
9449 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9450 BarSignal.getValue(0)),
9451 0);
9452 return BarWait;
9453 }
9454
9455 return SDValue();
9456 };
9457 case Intrinsic::amdgcn_tbuffer_store: {
9458 SDValue VData = Op.getOperand(2);
9459 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9460 if (IsD16)
9461 VData = handleD16VData(VData, DAG);
9462 unsigned Dfmt = Op.getConstantOperandVal(8);
9463 unsigned Nfmt = Op.getConstantOperandVal(9);
9464 unsigned Glc = Op.getConstantOperandVal(10);
9465 unsigned Slc = Op.getConstantOperandVal(11);
9466 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9467 SDValue Ops[] = {
9468 Chain,
9469 VData, // vdata
9470 Op.getOperand(3), // rsrc
9471 Op.getOperand(4), // vindex
9472 Op.getOperand(5), // voffset
9473 Op.getOperand(6), // soffset
9474 Op.getOperand(7), // offset
9475 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
9476 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9477 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9478 };
9479    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9480                           AMDGPUISD::TBUFFER_STORE_FORMAT;
9481 MemSDNode *M = cast<MemSDNode>(Op);
9482 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9483 M->getMemoryVT(), M->getMemOperand());
9484 }
9485
9486 case Intrinsic::amdgcn_struct_tbuffer_store:
9487 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9488 SDValue VData = Op.getOperand(2);
9489 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9490 if (IsD16)
9491 VData = handleD16VData(VData, DAG);
9492 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9493 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9494 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9495 SDValue Ops[] = {
9496 Chain,
9497 VData, // vdata
9498 Rsrc, // rsrc
9499 Op.getOperand(4), // vindex
9500 Offsets.first, // voffset
9501 SOffset, // soffset
9502 Offsets.second, // offset
9503 Op.getOperand(7), // format
9504 Op.getOperand(8), // cachepolicy, swizzled buffer
9505 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9506 };
9507    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9508                           AMDGPUISD::TBUFFER_STORE_FORMAT;
9509 MemSDNode *M = cast<MemSDNode>(Op);
9510 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9511 M->getMemoryVT(), M->getMemOperand());
9512 }
9513
9514 case Intrinsic::amdgcn_raw_tbuffer_store:
9515 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9516 SDValue VData = Op.getOperand(2);
9517 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9518 if (IsD16)
9519 VData = handleD16VData(VData, DAG);
9520 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9521 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9522 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9523 SDValue Ops[] = {
9524 Chain,
9525 VData, // vdata
9526 Rsrc, // rsrc
9527 DAG.getConstant(0, DL, MVT::i32), // vindex
9528 Offsets.first, // voffset
9529 SOffset, // soffset
9530 Offsets.second, // offset
9531 Op.getOperand(6), // format
9532 Op.getOperand(7), // cachepolicy, swizzled buffer
9533 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9534 };
9535    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9536                           AMDGPUISD::TBUFFER_STORE_FORMAT;
9537 MemSDNode *M = cast<MemSDNode>(Op);
9538 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9539 M->getMemoryVT(), M->getMemOperand());
9540 }
9541
9542 case Intrinsic::amdgcn_buffer_store:
9543 case Intrinsic::amdgcn_buffer_store_format: {
9544 SDValue VData = Op.getOperand(2);
9545 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9546 if (IsD16)
9547 VData = handleD16VData(VData, DAG);
9548 unsigned Glc = Op.getConstantOperandVal(6);
9549 unsigned Slc = Op.getConstantOperandVal(7);
9550 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9551 SDValue Ops[] = {
9552 Chain,
9553 VData,
9554 Op.getOperand(3), // rsrc
9555 Op.getOperand(4), // vindex
9556 SDValue(), // voffset -- will be set by setBufferOffsets
9557 SDValue(), // soffset -- will be set by setBufferOffsets
9558 SDValue(), // offset -- will be set by setBufferOffsets
9559 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9560 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9561 };
9562 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
9563
9564    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9565        AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9566 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9567 MemSDNode *M = cast<MemSDNode>(Op);
9568
9569 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9570 EVT VDataType = VData.getValueType().getScalarType();
9571 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9572 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9573
9574 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9575 M->getMemoryVT(), M->getMemOperand());
9576 }
9577
9578 case Intrinsic::amdgcn_raw_buffer_store:
9579 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9580 case Intrinsic::amdgcn_raw_buffer_store_format:
9581 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9582 const bool IsFormat =
9583 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9584 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9585
9586 SDValue VData = Op.getOperand(2);
9587 EVT VDataVT = VData.getValueType();
9588 EVT EltType = VDataVT.getScalarType();
9589 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9590 if (IsD16) {
9591 VData = handleD16VData(VData, DAG);
9592 VDataVT = VData.getValueType();
9593 }
9594
9595 if (!isTypeLegal(VDataVT)) {
9596 VData =
9597 DAG.getNode(ISD::BITCAST, DL,
9598 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9599 }
9600
9601 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9602 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9603 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9604 SDValue Ops[] = {
9605 Chain,
9606 VData,
9607 Rsrc,
9608 DAG.getConstant(0, DL, MVT::i32), // vindex
9609 Offsets.first, // voffset
9610 SOffset, // soffset
9611 Offsets.second, // offset
9612 Op.getOperand(6), // cachepolicy, swizzled buffer
9613 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9614 };
9615    unsigned Opc =
9616        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9617 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9618 MemSDNode *M = cast<MemSDNode>(Op);
9619
9620 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9621 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9622 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9623
9624 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9625 M->getMemoryVT(), M->getMemOperand());
9626 }
9627
9628 case Intrinsic::amdgcn_struct_buffer_store:
9629 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9630 case Intrinsic::amdgcn_struct_buffer_store_format:
9631 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9632 const bool IsFormat =
9633 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9634 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9635
9636 SDValue VData = Op.getOperand(2);
9637 EVT VDataVT = VData.getValueType();
9638 EVT EltType = VDataVT.getScalarType();
9639 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9640
9641 if (IsD16) {
9642 VData = handleD16VData(VData, DAG);
9643 VDataVT = VData.getValueType();
9644 }
9645
9646 if (!isTypeLegal(VDataVT)) {
9647 VData =
9648 DAG.getNode(ISD::BITCAST, DL,
9649 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9650 }
9651
9652 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9653 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9654 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9655 SDValue Ops[] = {
9656 Chain,
9657 VData,
9658 Rsrc,
9659 Op.getOperand(4), // vindex
9660 Offsets.first, // voffset
9661 SOffset, // soffset
9662 Offsets.second, // offset
9663 Op.getOperand(7), // cachepolicy, swizzled buffer
9664 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9665 };
9666    unsigned Opc =
9667        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9668 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9669 MemSDNode *M = cast<MemSDNode>(Op);
9670
9671 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9672 EVT VDataType = VData.getValueType().getScalarType();
9673 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9674 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9675
9676 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9677 M->getMemoryVT(), M->getMemOperand());
9678 }
9679 case Intrinsic::amdgcn_raw_buffer_load_lds:
9680 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9681 case Intrinsic::amdgcn_struct_buffer_load_lds:
9682 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9683 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9684 unsigned Opc;
9685 bool HasVIndex =
9686 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9687 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9688 unsigned OpOffset = HasVIndex ? 1 : 0;
9689 SDValue VOffset = Op.getOperand(5 + OpOffset);
9690 bool HasVOffset = !isNullConstant(VOffset);
9691 unsigned Size = Op->getConstantOperandVal(4);
9692
9693 switch (Size) {
9694 default:
9695 return SDValue();
9696 case 1:
9697 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9698 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9699 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9700 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9701 break;
9702 case 2:
9703 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9704 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9705 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9706 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9707 break;
9708 case 4:
9709 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9710 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9711 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9712 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9713 break;
9714 }
9715
9716 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9717
9718    SmallVector<SDValue, 8> Ops;
9719
9720 if (HasVIndex && HasVOffset)
9721 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9722 { Op.getOperand(5), // VIndex
9723 VOffset }));
9724 else if (HasVIndex)
9725 Ops.push_back(Op.getOperand(5));
9726 else if (HasVOffset)
9727 Ops.push_back(VOffset);
9728
9729 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9730 Ops.push_back(Rsrc);
9731 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9732 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9733 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9734 Ops.push_back(
9735 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9736    Ops.push_back(DAG.getTargetConstant(
9737        Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9738 Ops.push_back(M0Val.getValue(0)); // Chain
9739 Ops.push_back(M0Val.getValue(1)); // Glue
9740
9741 auto *M = cast<MemSDNode>(Op);
9742 MachineMemOperand *LoadMMO = M->getMemOperand();
9743 // Don't set the offset value here because the pointer points to the base of
9744 // the buffer.
9745 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9746
9747 MachinePointerInfo StorePtrI = LoadPtrI;
9748 LoadPtrI.V = PoisonValue::get(
9749        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9750    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9751    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9752
9753    auto F = LoadMMO->getFlags() &
9754             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9755    LoadMMO =
9756        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9757                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9758
9759    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9760        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9761        LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9762
9763 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9764 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9765
9766 return SDValue(Load, 0);
9767 }
9768 case Intrinsic::amdgcn_global_load_lds: {
9769 unsigned Opc;
9770 unsigned Size = Op->getConstantOperandVal(4);
9771 switch (Size) {
9772 default:
9773 return SDValue();
9774 case 1:
9775 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9776 break;
9777 case 2:
9778 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9779 break;
9780 case 4:
9781 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9782 break;
9783 }
9784
9785 auto *M = cast<MemSDNode>(Op);
9786 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9787
9788    SmallVector<SDValue, 6> Ops;
9789
9790 SDValue Addr = Op.getOperand(2); // Global ptr
9791 SDValue VOffset;
9792 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9793 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9794 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9795 SDValue LHS = Addr.getOperand(0);
9796 SDValue RHS = Addr.getOperand(1);
9797
9798 if (LHS->isDivergent())
9799 std::swap(LHS, RHS);
9800
9801 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9802 RHS.getOperand(0).getValueType() == MVT::i32) {
9803 // add (i64 sgpr), (zero_extend (i32 vgpr))
9804 Addr = LHS;
9805 VOffset = RHS.getOperand(0);
9806 }
9807 }
9808
9809 Ops.push_back(Addr);
9810 if (!Addr->isDivergent()) {
9811 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9812 if (!VOffset)
9813 VOffset = SDValue(
9814 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9815 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9816 Ops.push_back(VOffset);
9817 }
9818
9819 Ops.push_back(Op.getOperand(5)); // Offset
9820 Ops.push_back(Op.getOperand(6)); // CPol
9821 Ops.push_back(M0Val.getValue(0)); // Chain
9822 Ops.push_back(M0Val.getValue(1)); // Glue
9823
9824 MachineMemOperand *LoadMMO = M->getMemOperand();
9825 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9826 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9827 MachinePointerInfo StorePtrI = LoadPtrI;
9828    LoadPtrI.V = PoisonValue::get(
9829        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9830    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9831    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9832    auto F = LoadMMO->getFlags() &
9833             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9834    LoadMMO =
9835        MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9836                                LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9837    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9838        StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9839        LoadMMO->getAAInfo());
9840
9841 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9842 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9843
9844 return SDValue(Load, 0);
9845 }
9846 case Intrinsic::amdgcn_end_cf:
9847 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9848 Op->getOperand(2), Chain), 0);
9849 case Intrinsic::amdgcn_s_barrier_init:
9850 case Intrinsic::amdgcn_s_barrier_join:
9851 case Intrinsic::amdgcn_s_wakeup_barrier: {
9852 SDValue Chain = Op->getOperand(0);
9853    SmallVector<SDValue, 2> Ops;
9854    SDValue BarOp = Op->getOperand(2);
9855 unsigned Opc;
9856 bool IsInlinableBarID = false;
9857 int64_t BarVal;
9858
9859 if (isa<ConstantSDNode>(BarOp)) {
9860 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9861 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9862 }
9863
9864 if (IsInlinableBarID) {
9865 switch (IntrinsicID) {
9866 default:
9867 return SDValue();
9868 case Intrinsic::amdgcn_s_barrier_init:
9869 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9870 break;
9871 case Intrinsic::amdgcn_s_barrier_join:
9872 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9873 break;
9874 case Intrinsic::amdgcn_s_wakeup_barrier:
9875 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9876 break;
9877 }
9878
9879 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9880 Ops.push_back(K);
9881 } else {
9882 switch (IntrinsicID) {
9883 default:
9884 return SDValue();
9885 case Intrinsic::amdgcn_s_barrier_init:
9886 Opc = AMDGPU::S_BARRIER_INIT_M0;
9887 break;
9888 case Intrinsic::amdgcn_s_barrier_join:
9889 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9890 break;
9891 case Intrinsic::amdgcn_s_wakeup_barrier:
9892 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9893 break;
9894 }
9895 }
9896
9897 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9898 SDValue M0Val;
9899 // Member count will be read from M0[16:22]
9900 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9901 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9902
9903 if (!IsInlinableBarID) {
9904 // If reference to barrier id is not an inline constant then it must be
9905 // referenced with M0[4:0]. Perform an OR with the member count to
9906 // include it in M0.
9907 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9908 Op.getOperand(2), M0Val),
9909 0);
9910 }
9911 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9912 } else if (!IsInlinableBarID) {
9913 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9914 }
9915
9916 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9917 return SDValue(NewMI, 0);
9918 }
9919 default: {
9920 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9921            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
9922      return lowerImage(Op, ImageDimIntr, DAG, true);
9923
9924 return Op;
9925 }
9926 }
9927}
9928
9929// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9930// offset (the offset that is included in bounds checking and swizzling, to be
9931// split between the instruction's voffset and immoffset fields) and soffset
9932// (the offset that is excluded from bounds checking and swizzling, to go in
9933// the instruction's soffset field). This function takes the first kind of
9934// offset and figures out how to split it between voffset and immoffset.
9935std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9936 SDValue Offset, SelectionDAG &DAG) const {
9937 SDLoc DL(Offset);
9938 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
9939 SDValue N0 = Offset;
9940 ConstantSDNode *C1 = nullptr;
9941
9942 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
9943 N0 = SDValue();
9944 else if (DAG.isBaseWithConstantOffset(N0)) {
9945 C1 = cast<ConstantSDNode>(N0.getOperand(1));
9946 N0 = N0.getOperand(0);
9947 }
9948
9949 if (C1) {
9950 unsigned ImmOffset = C1->getZExtValue();
9951 // If the immediate value is too big for the immoffset field, put only bits
9952 // that would normally fit in the immoffset field. The remaining value that
9953 // is copied/added for the voffset field is a large power of 2, and it
9954 // stands more chance of being CSEd with the copy/add for another similar
9955 // load/store.
9956 // However, do not do that rounding down if that is a negative
9957 // number, as it appears to be illegal to have a negative offset in the
9958 // vgpr, even if adding the immediate offset makes it positive.
9959 unsigned Overflow = ImmOffset & ~MaxImm;
9960 ImmOffset -= Overflow;
9961 if ((int32_t)Overflow < 0) {
9962 Overflow += ImmOffset;
9963 ImmOffset = 0;
9964 }
9965 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
9966 if (Overflow) {
9967 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
9968 if (!N0)
9969 N0 = OverflowVal;
9970 else {
9971 SDValue Ops[] = { N0, OverflowVal };
9972 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
9973 }
9974 }
9975 }
9976 if (!N0)
9977 N0 = DAG.getConstant(0, DL, MVT::i32);
9978 if (!C1)
9979 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
9980 return {N0, SDValue(C1, 0)};
9981}
9982
9983// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
9984// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
9985// pointed to by Offsets.
9986void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
9987 SelectionDAG &DAG, SDValue *Offsets,
9988 Align Alignment) const {
9989  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9990  SDLoc DL(CombinedOffset);
9991 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
9992 uint32_t Imm = C->getZExtValue();
9993 uint32_t SOffset, ImmOffset;
9994 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
9995 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
9996 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
9997 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
9998 return;
9999 }
10000 }
10001 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10002 SDValue N0 = CombinedOffset.getOperand(0);
10003 SDValue N1 = CombinedOffset.getOperand(1);
10004 uint32_t SOffset, ImmOffset;
10005 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10006 if (Offset >= 0 &&
10007 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10008 Offsets[0] = N0;
10009 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10010 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10011 return;
10012 }
10013 }
10014
10015 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10016 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10017 : DAG.getConstant(0, DL, MVT::i32);
10018
10019 Offsets[0] = CombinedOffset;
10020 Offsets[1] = SOffsetZero;
10021 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10022}
10023
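// Buffer resource operands may be given either as a v4i32 descriptor or as a
// 128-bit buffer fat pointer; normalize the latter to v4i32.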
10024SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10025 SelectionDAG &DAG) const {
10026 if (!MaybePointer.getValueType().isScalarInteger())
10027 return MaybePointer;
10028
10029 SDLoc DL(MaybePointer);
10030
10031 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10032 return Rsrc;
10033}
10034
10035// Wrap a global or flat pointer into a buffer intrinsic using the flags
10036// specified in the intrinsic.
10037SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10038 SelectionDAG &DAG) const {
10039 SDLoc Loc(Op);
10040
10041 SDValue Pointer = Op->getOperand(1);
10042 SDValue Stride = Op->getOperand(2);
10043 SDValue NumRecords = Op->getOperand(3);
10044 SDValue Flags = Op->getOperand(4);
10045
10046 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10047 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10048 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10049 std::optional<uint32_t> ConstStride = std::nullopt;
10050 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10051 ConstStride = ConstNode->getZExtValue();
10052
10053 SDValue NewHighHalf = Masked;
10054 if (!ConstStride || *ConstStride != 0) {
10055 SDValue ShiftedStride;
10056 if (ConstStride) {
10057 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10058 } else {
10059 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10060 ShiftedStride =
10061 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10062 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10063 }
10064 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10065 }
10066
10067 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10068 NewHighHalf, NumRecords, Flags);
10069 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10070 return RsrcPtr;
10071}
10072
10073// Handle 8 bit and 16 bit buffer loads
10074SDValue
10075SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
10076                                             SDLoc DL, ArrayRef<SDValue> Ops,
10077                                             MachineMemOperand *MMO) const {
10078 EVT IntVT = LoadVT.changeTypeToInteger();
10079  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10080      AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10081
10082 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10083 SDValue BufferLoad =
10084 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10085 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10086 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10087
10088 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10089}
10090
10091// Handle 8 bit and 16 bit buffer stores
10092SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10093 EVT VDataType, SDLoc DL,
10094 SDValue Ops[],
10095 MemSDNode *M) const {
10096 if (VDataType == MVT::f16)
10097 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10098
10099 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10100 Ops[1] = BufferStoreExt;
10101  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10102                                          AMDGPUISD::BUFFER_STORE_SHORT;
10103 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10104 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10105 M->getMemOperand());
10106}
10107
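// Convert a value loaded as 32 bits back to the requested type, applying the
// truncation or extension implied by the original load's extension kind.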
10108static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10109                                 ISD::LoadExtType ExtType, SDValue Op,
10110 const SDLoc &SL, EVT VT) {
10111 if (VT.bitsLT(Op.getValueType()))
10112 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10113
10114 switch (ExtType) {
10115 case ISD::SEXTLOAD:
10116 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10117 case ISD::ZEXTLOAD:
10118 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10119 case ISD::EXTLOAD:
10120 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10121 case ISD::NON_EXTLOAD:
10122 return Op;
10123 }
10124
10125 llvm_unreachable("invalid ext type");
10126}
10127
10128// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10129// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10130SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10131 SelectionDAG &DAG = DCI.DAG;
10132 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10133 return SDValue();
10134
10135 // FIXME: Constant loads should all be marked invariant.
10136 unsigned AS = Ld->getAddressSpace();
10137 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10138      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10139      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10140 return SDValue();
10141
10142 // Don't do this early, since it may interfere with adjacent load merging for
10143 // illegal types. We can avoid losing alignment information for exotic types
10144 // pre-legalize.
10145 EVT MemVT = Ld->getMemoryVT();
10146 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10147 MemVT.getSizeInBits() >= 32)
10148 return SDValue();
10149
10150 SDLoc SL(Ld);
10151
10152 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10153 "unexpected vector extload");
10154
10155 // TODO: Drop only high part of range.
10156 SDValue Ptr = Ld->getBasePtr();
10157 SDValue NewLoad = DAG.getLoad(
10158 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10159 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10160 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10161 nullptr); // Drop ranges
10162
10163 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10164 if (MemVT.isFloatingPoint()) {
10165 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10166 "unexpected fp extload");
10167 TruncVT = MemVT.changeTypeToInteger();
10168 }
10169
10170 SDValue Cvt = NewLoad;
10171 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10172 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10173 DAG.getValueType(TruncVT));
10174 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10175 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10176 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10177 } else {
10178 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10179 }
10180
10181 EVT VT = Ld->getValueType(0);
10182 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10183
10184 DCI.AddToWorklist(Cvt.getNode());
10185
10186 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10187 // the appropriate extension from the 32-bit load.
10188 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10189 DCI.AddToWorklist(Cvt.getNode());
10190
10191 // Handle conversion back to floating point if necessary.
10192 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10193
10194 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10195}
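// Worked example (illustrative): a uniform, align(4) s/zextload of i16 from
// constant or invariant global memory is rewritten above into a plain i32
// load; the value is then re-narrowed with the matching sign/zero
// extend-in-reg of the low 16 bits and extended or truncated back to the
// original result type.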
10196
10197static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10198 const SIMachineFunctionInfo &Info) {
10199 // TODO: Should check if the address can definitely not access stack.
10200 if (Info.isEntryFunction())
10201 return Info.getUserSGPRInfo().hasFlatScratchInit();
10202 return true;
10203}
10204
10205SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10206 SDLoc DL(Op);
10207 LoadSDNode *Load = cast<LoadSDNode>(Op);
10208 ISD::LoadExtType ExtType = Load->getExtensionType();
10209 EVT MemVT = Load->getMemoryVT();
10210
10211 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10212 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10213 return SDValue();
10214
10215 // FIXME: Copied from PPC
10216 // First, load into 32 bits, then truncate to 1 bit.
10217
10218 SDValue Chain = Load->getChain();
10219 SDValue BasePtr = Load->getBasePtr();
10220 MachineMemOperand *MMO = Load->getMemOperand();
10221
10222 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10223
10224 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10225 BasePtr, RealMemVT, MMO);
10226
10227 if (!MemVT.isVector()) {
10228 SDValue Ops[] = {
10229 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10230 NewLD.getValue(1)
10231 };
10232
10233 return DAG.getMergeValues(Ops, DL);
10234 }
10235
10236 SmallVector<SDValue, 3> Elts;
10237 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10238 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10239 DAG.getConstant(I, DL, MVT::i32));
10240
10241 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10242 }
10243
10244 SDValue Ops[] = {
10245 DAG.getBuildVector(MemVT, DL, Elts),
10246 NewLD.getValue(1)
10247 };
10248
10249 return DAG.getMergeValues(Ops, DL);
10250 }
10251
10252 if (!MemVT.isVector())
10253 return SDValue();
10254
10255 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10256 "Custom lowering for non-i32 vectors hasn't been implemented.");
10257
10258 Align Alignment = Load->getAlign();
10259 unsigned AS = Load->getAddressSpace();
10260 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10261 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10262 return SplitVectorLoad(Op, DAG);
10263 }
10264
10265 MachineFunction &MF = DAG.getMachineFunction();
10266 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10267 // If there is a possibility that flat instructions access scratch memory
10268 // then we need to use the same legalization rules we use for private.
10269 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10270 !Subtarget->hasMultiDwordFlatScratchAddressing())
10271 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10272 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10273
10274 unsigned NumElements = MemVT.getVectorNumElements();
10275
10276 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10277 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10278 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10279 if (MemVT.isPow2VectorType() ||
10280 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10281 return SDValue();
10282 return WidenOrSplitVectorLoad(Op, DAG);
10283 }
10284 // Non-uniform loads will be selected to MUBUF instructions, so they
10285 // have the same legalization requirements as global and private
10286 // loads.
10287 //
10288 }
10289
10290 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10291 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10292 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10293 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10294 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10295 Alignment >= Align(4) && NumElements < 32) {
10296 if (MemVT.isPow2VectorType() ||
10297 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10298 return SDValue();
10299 return WidenOrSplitVectorLoad(Op, DAG);
10300 }
10301 // Non-uniform loads will be selected to MUBUF instructions, so they
10302 // have the same legalization requirements as global and private
10303 // loads.
10304 //
10305 }
10306 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10307 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10308 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10309 AS == AMDGPUAS::FLAT_ADDRESS) {
10310 if (NumElements > 4)
10311 return SplitVectorLoad(Op, DAG);
10312 // v3 loads not supported on SI.
10313 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10314 return WidenOrSplitVectorLoad(Op, DAG);
10315
10316 // v3 and v4 loads are supported for private and global memory.
10317 return SDValue();
10318 }
10319 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10320 // Depending on the setting of the private_element_size field in the
10321 // resource descriptor, we can only make private accesses up to a certain
10322 // size.
10323 switch (Subtarget->getMaxPrivateElementSize()) {
10324 case 4: {
10325 SDValue Ops[2];
10326 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10327 return DAG.getMergeValues(Ops, DL);
10328 }
10329 case 8:
10330 if (NumElements > 2)
10331 return SplitVectorLoad(Op, DAG);
10332 return SDValue();
10333 case 16:
10334 // Same as global/flat
10335 if (NumElements > 4)
10336 return SplitVectorLoad(Op, DAG);
10337 // v3 loads not supported on SI.
10338 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10339 return WidenOrSplitVectorLoad(Op, DAG);
10340
10341 return SDValue();
10342 default:
10343 llvm_unreachable("unsupported private_element_size");
10344 }
10345 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10346 unsigned Fast = 0;
10347 auto Flags = Load->getMemOperand()->getFlags();
10348 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10349 Load->getAlign(), Flags, &Fast) &&
10350 Fast > 1)
10351 return SDValue();
10352
10353 if (MemVT.isVector())
10354 return SplitVectorLoad(Op, DAG);
10355 }
10356
10357 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10358 MemVT, *Load->getMemOperand())) {
10359 SDValue Ops[2];
10360 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10361 return DAG.getMergeValues(Ops, DL);
10362 }
10363
10364 return SDValue();
10365}
10366
10367SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10368 EVT VT = Op.getValueType();
10369 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10370 VT.getSizeInBits() == 512)
10371 return splitTernaryVectorOp(Op, DAG);
10372
10373 assert(VT.getSizeInBits() == 64);
10374
10375 SDLoc DL(Op);
10376 SDValue Cond = Op.getOperand(0);
10377
10378 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10379 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10380
10381 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10382 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10383
10384 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10385 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10386
10387 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10388
10389 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10390 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10391
10392 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10393
10394 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10395 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10396}
10397
10398// Catch division cases where we can use shortcuts with rcp and rsq
10399// instructions.
10400SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10401 SelectionDAG &DAG) const {
10402 SDLoc SL(Op);
10403 SDValue LHS = Op.getOperand(0);
10404 SDValue RHS = Op.getOperand(1);
10405 EVT VT = Op.getValueType();
10406 const SDNodeFlags Flags = Op->getFlags();
10407
10408 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10409 DAG.getTarget().Options.UnsafeFPMath;
10410
10411 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10412 // Without !fpmath accuracy information, we can't do more because we don't
10413 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10414 // f16 is always accurate enough
10415 if (!AllowInaccurateRcp && VT != MVT::f16)
10416 return SDValue();
10417
10418 if (CLHS->isExactlyValue(1.0)) {
10419 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10420 // the CI documentation has a worst case error of 1 ulp.
10421 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10422 // use it as long as we aren't trying to use denormals.
10423 //
10424 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10425
10426 // 1.0 / sqrt(x) -> rsq(x)
10427
10428 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10429 // error seems really high at 2^29 ULP.
10430 // 1.0 / x -> rcp(x)
10431 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10432 }
10433
10434 // Same as for 1.0, but expand the sign out of the constant.
10435 if (CLHS->isExactlyValue(-1.0)) {
10436 // -1.0 / x -> rcp (fneg x)
10437 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10438 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10439 }
10440 }
10441
10442 // For f16 require afn or arcp.
10443 // For f32 require afn.
10444 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10445 return SDValue();
10446
10447 // Turn into multiply by the reciprocal.
10448 // x / y -> x * (1.0 / y)
10449 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10450 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10451}
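// Informal examples of the shortcuts above: with afn, 'fdiv 1.0, %x' lowers to
// a single RCP of %x, 'fdiv -1.0, %x' lowers to RCP (FNEG %x), and a general
// 'fdiv %a, %b' lowers to FMUL (%a, RCP (%b)).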
10452
10453SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10454 SelectionDAG &DAG) const {
10455 SDLoc SL(Op);
10456 SDValue X = Op.getOperand(0);
10457 SDValue Y = Op.getOperand(1);
10458 EVT VT = Op.getValueType();
10459 const SDNodeFlags Flags = Op->getFlags();
10460
10461 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10462 DAG.getTarget().Options.UnsafeFPMath;
10463 if (!AllowInaccurateDiv)
10464 return SDValue();
10465
10466 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10467 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10468
10469 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10470 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10471
10472 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10473 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10474 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10475 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10476 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10477 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10478}
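// Written out, the FMA chain above is Newton-Raphson refinement of r ~= 1/y:
// e = fma(-y, r, 1); r = fma(e, r, r) (performed twice), then ret = x * r is
// corrected once via t = fma(-y, ret, x); ret = fma(t, r, ret).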
10479
10480static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10481 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10482 SDNodeFlags Flags) {
10483 if (GlueChain->getNumValues() <= 1) {
10484 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10485 }
10486
10487 assert(GlueChain->getNumValues() == 3);
10488
10489 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10490 switch (Opcode) {
10491 default: llvm_unreachable("no chain equivalent for opcode");
10492 case ISD::FMUL:
10493 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10494 break;
10495 }
10496
10497 return DAG.getNode(Opcode, SL, VTList,
10498 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10499 Flags);
10500}
10501
10502static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10503 EVT VT, SDValue A, SDValue B, SDValue C,
10504 SDValue GlueChain, SDNodeFlags Flags) {
10505 if (GlueChain->getNumValues() <= 1) {
10506 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10507 }
10508
10509 assert(GlueChain->getNumValues() == 3);
10510
10511 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10512 switch (Opcode) {
10513 default: llvm_unreachable("no chain equivalent for opcode");
10514 case ISD::FMA:
10515 Opcode = AMDGPUISD::FMA_W_CHAIN;
10516 break;
10517 }
10518
10519 return DAG.getNode(Opcode, SL, VTList,
10520 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10521 Flags);
10522}
10523
10524SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10525 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10526 return FastLowered;
10527
10528 SDLoc SL(Op);
10529 SDValue Src0 = Op.getOperand(0);
10530 SDValue Src1 = Op.getOperand(1);
10531
10532 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10533 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10534
10535 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10536 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10537
10538 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10539 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10540
10541 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10542}
10543
10544// Faster 2.5 ULP division that does not support denormals.
10545SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10546 SDNodeFlags Flags = Op->getFlags();
10547 SDLoc SL(Op);
10548 SDValue LHS = Op.getOperand(1);
10549 SDValue RHS = Op.getOperand(2);
10550
10551 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10552
10553 const APFloat K0Val(0x1p+96f);
10554 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10555
10556 const APFloat K1Val(0x1p-32f);
10557 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10558
10559 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10560
10561 EVT SetCCVT =
10562 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10563
10564 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10565
10566 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10567
10568 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10569
10570 // rcp does not support denormals.
10571 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10572
10573 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10574
10575 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10576}
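// Worked example of the scaling above: if |rhs| > 2^96, the denominator is
// pre-multiplied by r3 = 2^-32 so the reciprocal does not underflow, and the
// final product is multiplied by the same r3, i.e.
// lhs / rhs == (lhs * rcp(rhs * 2^-32)) * 2^-32.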
10577
10578// Returns immediate value for setting the F32 denorm mode when using the
10579// S_DENORM_MODE instruction.
10580static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10581 const SIMachineFunctionInfo *Info,
10582 const GCNSubtarget *ST) {
10583 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10584 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10585 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10586 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10587}
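// For example (assuming the usual SIDefines encoding where
// FP_DENORM_FLUSH_NONE == 3): with the DP field at its IEEE default of 3,
// enabling SP denormals yields Mode = 3 | (3 << 2) = 0xf as the
// S_DENORM_MODE immediate.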
10588
10589SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10590 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10591 return FastLowered;
10592
10593 // The selection matcher assumes anything with a chain selects to a
10594 // mayRaiseFPException machine instruction. Since we're introducing a chain
10595 // here, we need to explicitly report nofpexcept for the regular fdiv
10596 // lowering.
10597 SDNodeFlags Flags = Op->getFlags();
10598 Flags.setNoFPExcept(true);
10599
10600 SDLoc SL(Op);
10601 SDValue LHS = Op.getOperand(0);
10602 SDValue RHS = Op.getOperand(1);
10603
10604 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10605
10606 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10607
10608 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10609 {RHS, RHS, LHS}, Flags);
10610 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10611 {LHS, RHS, LHS}, Flags);
10612
10613 // Denominator is scaled to not be denormal, so using rcp is ok.
10614 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10615 DenominatorScaled, Flags);
10616 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10617 DenominatorScaled, Flags);
10618
10619 using namespace AMDGPU::Hwreg;
10620 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10621 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10622
10623 const MachineFunction &MF = DAG.getMachineFunction();
10624 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10625 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10626
10627 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10628 const bool HasDynamicDenormals =
10629 (DenormMode.Input == DenormalMode::Dynamic) ||
10630 (DenormMode.Output == DenormalMode::Dynamic);
10631
10632 SDValue SavedDenormMode;
10633
10634 if (!PreservesDenormals) {
10635 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10636 // lowering. The chain dependence is insufficient, and we need glue. We do
10637 // not need the glue variants in a strictfp function.
10638
10639 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10640
10641 SDValue Glue = DAG.getEntryNode();
10642 if (HasDynamicDenormals) {
10643 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10644 DAG.getVTList(MVT::i32, MVT::Glue),
10645 {BitField, Glue});
10646 SavedDenormMode = SDValue(GetReg, 0);
10647
10648 Glue = DAG.getMergeValues(
10649 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10650 }
10651
10652 SDNode *EnableDenorm;
10653 if (Subtarget->hasDenormModeInst()) {
10654 const SDValue EnableDenormValue =
10655 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10656
10657 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10658 EnableDenormValue)
10659 .getNode();
10660 } else {
10661 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10662 SL, MVT::i32);
10663 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10664 {EnableDenormValue, BitField, Glue});
10665 }
10666
10667 SDValue Ops[3] = {
10668 NegDivScale0,
10669 SDValue(EnableDenorm, 0),
10670 SDValue(EnableDenorm, 1)
10671 };
10672
10673 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10674 }
10675
10676 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10677 ApproxRcp, One, NegDivScale0, Flags);
10678
10679 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10680 ApproxRcp, Fma0, Flags);
10681
10682 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10683 Fma1, Fma1, Flags);
10684
10685 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10686 NumeratorScaled, Mul, Flags);
10687
10688 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10689 Fma2, Fma1, Mul, Fma2, Flags);
10690
10691 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10692 NumeratorScaled, Fma3, Flags);
10693
10694 if (!PreservesDenormals) {
10695 SDNode *DisableDenorm;
10696 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10697 const SDValue DisableDenormValue = getSPDenormModeValue(
10698 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10699
10700 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10701 Fma4.getValue(1), DisableDenormValue,
10702 Fma4.getValue(2)).getNode();
10703 } else {
10704 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10705 const SDValue DisableDenormValue =
10706 HasDynamicDenormals
10707 ? SavedDenormMode
10708 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10709
10710 DisableDenorm = DAG.getMachineNode(
10711 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10712 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10713 }
10714
10715 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10716 SDValue(DisableDenorm, 0), DAG.getRoot());
10717 DAG.setRoot(OutputChain);
10718 }
10719
10720 SDValue Scale = NumeratorScaled.getValue(1);
10721 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10722 {Fma4, Fma1, Fma3, Scale}, Flags);
10723
10724 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10725}
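// In formula form, with n, d the scaled operands and r = rcp(d), the chain
// above computes e0 = 1 - d*r, r1 = r + r*e0, q0 = n*r1, e1 = n - d*q0,
// q1 = q0 + r1*e1 and e2 = n - d*q1; DIV_FMAS then combines (e2, r1, q1) with
// the div_scale flag and DIV_FIXUP produces the final quotient.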
10726
10727SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10728 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10729 return FastLowered;
10730
10731 SDLoc SL(Op);
10732 SDValue X = Op.getOperand(0);
10733 SDValue Y = Op.getOperand(1);
10734
10735 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10736
10737 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10738
10739 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10740
10741 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10742
10743 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10744
10745 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10746
10747 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10748
10749 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10750
10751 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10752
10753 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10754 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10755
10756 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10757 NegDivScale0, Mul, DivScale1);
10758
10759 SDValue Scale;
10760
10761 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10762 // Work around a hardware bug on SI where the condition output from div_scale
10763 // is not usable.
10764
10765 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10766
10767 // Figure out which scale to use for div_fmas.
10768 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10769 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10770 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10771 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10772
10773 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10774 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10775
10776 SDValue Scale0Hi
10777 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10778 SDValue Scale1Hi
10779 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10780
10781 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10782 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10783 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10784 } else {
10785 Scale = DivScale1.getValue(1);
10786 }
10787
10788 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10789 Fma4, Fma3, Mul, Scale);
10790
10791 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10792}
10793
10794SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10795 EVT VT = Op.getValueType();
10796
10797 if (VT == MVT::f32)
10798 return LowerFDIV32(Op, DAG);
10799
10800 if (VT == MVT::f64)
10801 return LowerFDIV64(Op, DAG);
10802
10803 if (VT == MVT::f16)
10804 return LowerFDIV16(Op, DAG);
10805
10806 llvm_unreachable("Unexpected type for fdiv");
10807}
10808
10809SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10810 SDLoc dl(Op);
10811 SDValue Val = Op.getOperand(0);
10812 EVT VT = Val.getValueType();
10813 EVT ResultExpVT = Op->getValueType(1);
10814 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10815
10816 SDValue Mant = DAG.getNode(
10817 ISD::INTRINSIC_WO_CHAIN, dl, VT,
10818 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10819
10820 SDValue Exp = DAG.getNode(
10821 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10822 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10823
10824 if (Subtarget->hasFractBug()) {
10825 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10826 SDValue Inf = DAG.getConstantFP(
10827 APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10828
10829 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10830 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10831 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10832 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10833 }
10834
10835 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10836 return DAG.getMergeValues({Mant, CastExp}, dl);
10837}
10838
10839SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10840 SDLoc DL(Op);
10841 StoreSDNode *Store = cast<StoreSDNode>(Op);
10842 EVT VT = Store->getMemoryVT();
10843
10844 if (VT == MVT::i1) {
10845 return DAG.getTruncStore(Store->getChain(), DL,
10846 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10847 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10848 }
10849
10850 assert(VT.isVector() &&
10851 Store->getValue().getValueType().getScalarType() == MVT::i32);
10852
10853 unsigned AS = Store->getAddressSpace();
10854 if (Subtarget->hasLDSMisalignedBug() &&
10855 AS == AMDGPUAS::FLAT_ADDRESS &&
10856 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10857 return SplitVectorStore(Op, DAG);
10858 }
10859
10860 MachineFunction &MF = DAG.getMachineFunction();
10861 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10862 // If there is a possibility that flat instructions access scratch memory
10863 // then we need to use the same legalization rules we use for private.
10864 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10865 !Subtarget->hasMultiDwordFlatScratchAddressing())
10866 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10867 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10868
10869 unsigned NumElements = VT.getVectorNumElements();
10870 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10871 AS == AMDGPUAS::FLAT_ADDRESS) {
10872 if (NumElements > 4)
10873 return SplitVectorStore(Op, DAG);
10874 // v3 stores not supported on SI.
10875 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10876 return SplitVectorStore(Op, DAG);
10877
10878 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10879 VT, *Store->getMemOperand()))
10880 return expandUnalignedStore(Store, DAG);
10881
10882 return SDValue();
10883 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10884 switch (Subtarget->getMaxPrivateElementSize()) {
10885 case 4:
10886 return scalarizeVectorStore(Store, DAG);
10887 case 8:
10888 if (NumElements > 2)
10889 return SplitVectorStore(Op, DAG);
10890 return SDValue();
10891 case 16:
10892 if (NumElements > 4 ||
10893 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10894 return SplitVectorStore(Op, DAG);
10895 return SDValue();
10896 default:
10897 llvm_unreachable("unsupported private_element_size");
10898 }
10899 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10900 unsigned Fast = 0;
10901 auto Flags = Store->getMemOperand()->getFlags();
10902 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10903 Store->getAlign(), Flags, &Fast) &&
10904 Fast > 1)
10905 return SDValue();
10906
10907 if (VT.isVector())
10908 return SplitVectorStore(Op, DAG);
10909
10910 return expandUnalignedStore(Store, DAG);
10911 }
10912
10913 // Probably an invalid store. If so we'll end up emitting a selection error.
10914 return SDValue();
10915}
10916
10917// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10918SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10919 SDLoc SL(Op);
10920 assert(!Subtarget->has16BitInsts());
10921 SDNodeFlags Flags = Op->getFlags();
10922 SDValue Ext =
10923 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
10924
10925 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
10926 SDValue Sqrt =
10927 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
10928
10929 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
10930 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
10931}
10932
10933SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10934 SDLoc DL(Op);
10935 SDNodeFlags Flags = Op->getFlags();
10936 MVT VT = Op.getValueType().getSimpleVT();
10937 const SDValue X = Op.getOperand(0);
10938
10939 if (allowApproxFunc(DAG, Flags)) {
10940 // Instruction is 1ulp but ignores denormals.
10941 return DAG.getNode(
10942 ISD::INTRINSIC_WO_CHAIN, DL, VT,
10943 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
10944 }
10945
10946 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
10947 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
10948
10949 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
10950
10951 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
10952
10953 SDValue SqrtX =
10954 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
10955
10956 SDValue SqrtS;
10957 if (needsDenormHandlingF32(DAG, X, Flags)) {
10958 SDValue SqrtID =
10959 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
10960 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
10961
10962 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
10963 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10964 DAG.getConstant(-1, DL, MVT::i32));
10965 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
10966
10967 SDValue NegSqrtSNextDown =
10968 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
10969
10970 SDValue SqrtVP =
10971 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
10972
10973 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
10974 DAG.getConstant(1, DL, MVT::i32));
10975 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
10976
10977 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
10978 SDValue SqrtVS =
10979 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
10980
10981 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
10982 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
10983
10984 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
10985 Flags);
10986
10987 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
10988 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
10989 Flags);
10990 } else {
10991 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
10992
10993 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
10994
10995 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
10996 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
10997 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
10998
10999 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11000 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11001 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11002
11003 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11004 SDValue SqrtD =
11005 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11006 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11007 }
11008
11009 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11010
11011 SDValue ScaledDown =
11012 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11013
11014 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11015 SDValue IsZeroOrInf =
11016 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11017 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11018
11019 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11020}
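// A note on the scaling above: inputs below 2^-96 are multiplied by 2^32
// before the square root and the result is multiplied by 2^-16 afterwards,
// which is exact since sqrt(x * 2^32) == sqrt(x) * 2^16.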
11021
11022SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11023 // For double type, the SQRT and RSQ instructions don't have required
11024 // precision, we apply Goldschmidt's algorithm to improve the result:
11025 //
11026 // y0 = rsq(x)
11027 // g0 = x * y0
11028 // h0 = 0.5 * y0
11029 //
11030 // r0 = 0.5 - h0 * g0
11031 // g1 = g0 * r0 + g0
11032 // h1 = h0 * r0 + h0
11033 //
11034 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11035 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11036 // h2 = h1 * r1 + h1
11037 //
11038 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11039 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11040 //
11041 // sqrt(x) = g3
11042
11043 SDNodeFlags Flags = Op->getFlags();
11044
11045 SDLoc DL(Op);
11046
11047 SDValue X = Op.getOperand(0);
11048 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11049
11050 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11051
11052 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11053
11054 // Scale up input if it is too small.
11055 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11056 SDValue ScaleUp =
11057 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11058 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11059
11060 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11061
11062 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11063
11064 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11065 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11066
11067 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11068 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11069
11070 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11071
11072 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11073
11074 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11075 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11076
11077 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11078
11079 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11080 SDValue SqrtD1 =
11081 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11082
11083 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11084
11085 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11086 SDValue ScaleDown =
11087 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11088 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11089
11090 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11091 // with finite only or nsz because rsq(+/-0) = +/-inf
11092
11093 // TODO: Check for DAZ and expand to subnormals
11094 SDValue IsZeroOrInf =
11095 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11096 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11097
11098 // If x is +INF, +0, or -0, use its original value
11099 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11100 Flags);
11101}
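// A note on the scaling above: inputs below 2^-767 are scaled by 2^256 via
// ldexp and the result is scaled back by 2^-128, using
// sqrt(x * 2^256) == sqrt(x) * 2^128, so subnormal inputs still go through
// the Goldschmidt iteration at full precision.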
11102
11103SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11104 SDLoc DL(Op);
11105 EVT VT = Op.getValueType();
11106 SDValue Arg = Op.getOperand(0);
11107 SDValue TrigVal;
11108
11109 // Propagate fast-math flags so that the multiply we introduce can be folded
11110 // if Arg is already the result of a multiply by constant.
11111 auto Flags = Op->getFlags();
11112
11113 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11114
11115 if (Subtarget->hasTrigReducedRange()) {
11116 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11117 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11118 } else {
11119 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11120 }
11121
11122 switch (Op.getOpcode()) {
11123 case ISD::FCOS:
11124 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11125 case ISD::FSIN:
11126 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11127 default:
11128 llvm_unreachable("Wrong trig opcode");
11129 }
11130}
11131
11132SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11133 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11134 assert(AtomicNode->isCompareAndSwap());
11135 unsigned AS = AtomicNode->getAddressSpace();
11136
11137 // No custom lowering required for local address space
11138 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11139 return Op;
11140
11141 // Non-local address space requires custom lowering for atomic compare
11142 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11143 SDLoc DL(Op);
11144 SDValue ChainIn = Op.getOperand(0);
11145 SDValue Addr = Op.getOperand(1);
11146 SDValue Old = Op.getOperand(2);
11147 SDValue New = Op.getOperand(3);
11148 EVT VT = Op.getValueType();
11149 MVT SimpleVT = VT.getSimpleVT();
11150 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11151
11152 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11153 SDValue Ops[] = { ChainIn, Addr, NewOld };
11154
11155 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11156 Ops, VT, AtomicNode->getMemOperand());
11157}
11158
11159//===----------------------------------------------------------------------===//
11160// Custom DAG optimizations
11161//===----------------------------------------------------------------------===//
11162
11163SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11164 DAGCombinerInfo &DCI) const {
11165 EVT VT = N->getValueType(0);
11166 EVT ScalarVT = VT.getScalarType();
11167 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11168 return SDValue();
11169
11170 SelectionDAG &DAG = DCI.DAG;
11171 SDLoc DL(N);
11172
11173 SDValue Src = N->getOperand(0);
11174 EVT SrcVT = Src.getValueType();
11175
11176 // TODO: We could try to match extracting the higher bytes, which would be
11177 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11178 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11179 // about in practice.
11180 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11181 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11182 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11183 DCI.AddToWorklist(Cvt.getNode());
11184
11185 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11186 if (ScalarVT != MVT::f32) {
11187 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11188 DAG.getTargetConstant(0, DL, MVT::i32));
11189 }
11190 return Cvt;
11191 }
11192 }
11193
11194 return SDValue();
11195}
11196
11197SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11198 DAGCombinerInfo &DCI) const {
11199 SDValue MagnitudeOp = N->getOperand(0);
11200 SDValue SignOp = N->getOperand(1);
11201 SelectionDAG &DAG = DCI.DAG;
11202 SDLoc DL(N);
11203
11204 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11205 // lower half with a copy.
11206 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11207 if (MagnitudeOp.getValueType() == MVT::f64) {
11208 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11209 SDValue MagLo =
11210 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11211 DAG.getConstant(0, DL, MVT::i32));
11212 SDValue MagHi =
11213 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11214 DAG.getConstant(1, DL, MVT::i32));
11215
11216 SDValue HiOp =
11217 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11218
11219 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11220
11221 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11222 }
11223
11224 if (SignOp.getValueType() != MVT::f64)
11225 return SDValue();
11226
11227 // Reduce width of sign operand, we only need the highest bit.
11228 //
11229 // fcopysign f64:x, f64:y ->
11230 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11231 // TODO: In some cases it might make sense to go all the way to f16.
11232 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11233 SDValue SignAsF32 =
11234 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11235 DAG.getConstant(1, DL, MVT::i32));
11236
11237 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11238 SignAsF32);
11239}
11240
11241// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11242// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11243// bits
11244
11245// This is a variant of
11246// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11247//
11248// The normal DAG combiner will do this, but only if the add has one use since
11249// that would increase the number of instructions.
11250//
11251// This prevents us from seeing a constant offset that can be folded into a
11252// memory instruction's addressing mode. If we know the resulting add offset of
11253// a pointer can be folded into an addressing offset, we can replace the pointer
11254// operand with the add of new constant offset. This eliminates one of the uses,
11255// and may allow the remaining use to also be simplified.
11256//
11257SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11258 unsigned AddrSpace,
11259 EVT MemVT,
11260 DAGCombinerInfo &DCI) const {
11261 SDValue N0 = N->getOperand(0);
11262 SDValue N1 = N->getOperand(1);
11263
11264 // We only do this to handle cases where it's profitable when there are
11265 // multiple uses of the add, so defer to the standard combine.
11266 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11267 N0->hasOneUse())
11268 return SDValue();
11269
11270 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11271 if (!CN1)
11272 return SDValue();
11273
11274 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11275 if (!CAdd)
11276 return SDValue();
11277
11278 SelectionDAG &DAG = DCI.DAG;
11279
11280 if (N0->getOpcode() == ISD::OR &&
11281 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11282 return SDValue();
11283
11284 // If the resulting offset is too large, we can't fold it into the
11285 // addressing mode offset.
11286 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11287 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11288
11289 AddrMode AM;
11290 AM.HasBaseReg = true;
11291 AM.BaseOffs = Offset.getSExtValue();
11292 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11293 return SDValue();
11294
11295 SDLoc SL(N);
11296 EVT VT = N->getValueType(0);
11297
11298 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11299 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11300
11301 SDNodeFlags Flags;
11302 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11303 (N0.getOpcode() == ISD::OR ||
11304 N0->getFlags().hasNoUnsignedWrap()));
11305
11306 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11307}
11308
11309/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
11310/// by the chain and intrinsic ID. Theoretically we would also need to check the
11311/// specific intrinsic, but they all place the pointer operand first.
11312static unsigned getBasePtrIndex(const MemSDNode *N) {
11313 switch (N->getOpcode()) {
11314 case ISD::STORE:
11315 case ISD::INTRINSIC_W_CHAIN:
11316 case ISD::INTRINSIC_VOID:
11317 return 2;
11318 default:
11319 return 1;
11320 }
11321}
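// For reference: a StoreSDNode is (chain, value, ptr, offset) and a target
// memory intrinsic is (chain, intrinsic id, ptr, ...), so both keep the
// pointer at operand 2, while loads and other MemSDNodes keep it at operand 1.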
11322
11323SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11324 DAGCombinerInfo &DCI) const {
11325 SelectionDAG &DAG = DCI.DAG;
11326 SDLoc SL(N);
11327
11328 unsigned PtrIdx = getBasePtrIndex(N);
11329 SDValue Ptr = N->getOperand(PtrIdx);
11330
11331 // TODO: We could also do this for multiplies.
11332 if (Ptr.getOpcode() == ISD::SHL) {
11333 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11334 N->getMemoryVT(), DCI);
11335 if (NewPtr) {
11336 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11337
11338 NewOps[PtrIdx] = NewPtr;
11339 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11340 }
11341 }
11342
11343 return SDValue();
11344}
11345
11346static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11347 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11348 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11349 (Opc == ISD::XOR && Val == 0);
11350}
11351
11352// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11353// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11354// integer combine opportunities since most 64-bit operations are decomposed
11355// this way. TODO: We won't want this for SALU especially if it is an inline
11356// immediate.
11357SDValue SITargetLowering::splitBinaryBitConstantOp(
11358 DAGCombinerInfo &DCI,
11359 const SDLoc &SL,
11360 unsigned Opc, SDValue LHS,
11361 const ConstantSDNode *CRHS) const {
11362 uint64_t Val = CRHS->getZExtValue();
11363 uint32_t ValLo = Lo_32(Val);
11364 uint32_t ValHi = Hi_32(Val);
11365 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11366
11367 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11368 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11369 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11370 // If we need to materialize a 64-bit immediate, it will be split up later
11371 // anyway. Avoid creating the harder to understand 64-bit immediate
11372 // materialization.
11373 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11374 }
11375
11376 return SDValue();
11377}
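// Worked example (informal) for the split above: 'and i64 %x, 0xffffffff'
// becomes an AND of the low half with -1, which folds away, and an AND of the
// high half with 0, which folds to 0, so no 64-bit immediate is materialized.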
11378
11379bool llvm::isBoolSGPR(SDValue V) {
11380 if (V.getValueType() != MVT::i1)
11381 return false;
11382 switch (V.getOpcode()) {
11383 default:
11384 break;
11385 case ISD::SETCC:
11386 case AMDGPUISD::FP_CLASS:
11387 return true;
11388 case ISD::AND:
11389 case ISD::OR:
11390 case ISD::XOR:
11391 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11392 }
11393 return false;
11394}
11395
11396// If a constant has all zeroes or all ones within each byte return it.
11397// Otherwise return 0.
11398static uint32_t getConstantPermuteMask(uint32_t C) {
11399 // 0xff for any zero byte in the mask
11400 uint32_t ZeroByteMask = 0;
11401 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11402 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11403 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11404 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11405 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11406 if ((NonZeroByteMask & C) != NonZeroByteMask)
11407 return 0; // Partial bytes selected.
11408 return C;
11409}
11410
11411// Check if a node selects whole bytes from its operand 0 starting at a byte
11412// boundary while masking the rest. Returns select mask as in the v_perm_b32
11413 // or -1 if it did not succeed.
11414// Note byte select encoding:
11415// value 0-3 selects corresponding source byte;
11416// value 0xc selects zero;
11417// value 0xff selects 0xff.
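// For example (illustrative): for (and x, 0x0000ffff) the helper below
// returns 0x0c0c0100, i.e. result bytes 0-1 come from source bytes 0-1 and
// result bytes 2-3 are forced to zero (0x0c).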
11418static uint32_t getPermuteMask(SDValue V) {
11419 assert(V.getValueSizeInBits() == 32);
11420
11421 if (V.getNumOperands() != 2)
11422 return ~0;
11423
11424 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11425 if (!N1)
11426 return ~0;
11427
11428 uint32_t C = N1->getZExtValue();
11429
11430 switch (V.getOpcode()) {
11431 default:
11432 break;
11433 case ISD::AND:
11434 if (uint32_t ConstMask = getConstantPermuteMask(C))
11435 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11436 break;
11437
11438 case ISD::OR:
11439 if (uint32_t ConstMask = getConstantPermuteMask(C))
11440 return (0x03020100 & ~ConstMask) | ConstMask;
11441 break;
11442
11443 case ISD::SHL:
11444 if (C % 8)
11445 return ~0;
11446
11447 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11448
11449 case ISD::SRL:
11450 if (C % 8)
11451 return ~0;
11452
11453 return uint32_t(0x0c0c0c0c03020100ull >> C);
11454 }
11455
11456 return ~0;
11457}
11458
11459SDValue SITargetLowering::performAndCombine(SDNode *N,
11460 DAGCombinerInfo &DCI) const {
11461 if (DCI.isBeforeLegalize())
11462 return SDValue();
11463
11464 SelectionDAG &DAG = DCI.DAG;
11465 EVT VT = N->getValueType(0);
11466 SDValue LHS = N->getOperand(0);
11467 SDValue RHS = N->getOperand(1);
11468
11469
11470 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11471 if (VT == MVT::i64 && CRHS) {
11472 if (SDValue Split
11473 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11474 return Split;
11475 }
11476
11477 if (CRHS && VT == MVT::i32) {
11478 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11479 // nb = number of trailing zeroes in mask
11480 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11481 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
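// For example (illustrative): (and (srl x, 4), 0xff0) has Bits == 8 and
// NB == 4, so it becomes (shl (AssertZext i8 (bfe x, 8, 8)), 4), which the
// SDWA pass can then select as a sub-dword access.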
11482 uint64_t Mask = CRHS->getZExtValue();
11483 unsigned Bits = llvm::popcount(Mask);
11484 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11485 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11486 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11487 unsigned Shift = CShift->getZExtValue();
11488 unsigned NB = CRHS->getAPIntValue().countr_zero();
11489 unsigned Offset = NB + Shift;
11490 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11491 SDLoc SL(N);
11492 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11493 LHS->getOperand(0),
11494 DAG.getConstant(Offset, SL, MVT::i32),
11495 DAG.getConstant(Bits, SL, MVT::i32));
11496 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11497 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11498 DAG.getValueType(NarrowVT));
11499 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11500 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11501 return Shl;
11502 }
11503 }
11504 }
11505
11506 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11507 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11508 isa<ConstantSDNode>(LHS.getOperand(2))) {
11509 uint32_t Sel = getConstantPermuteMask(Mask);
11510 if (!Sel)
11511 return SDValue();
11512
11513 // Select 0xc for all zero bytes
11514 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11515 SDLoc DL(N);
11516 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11517 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11518 }
11519 }
11520
11521 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11522 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11523 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11524 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11525 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11526
11527 SDValue X = LHS.getOperand(0);
11528 SDValue Y = RHS.getOperand(0);
11529 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11530 !isTypeLegal(X.getValueType()))
11531 return SDValue();
11532
11533 if (LCC == ISD::SETO) {
11534 if (X != LHS.getOperand(1))
11535 return SDValue();
11536
11537 if (RCC == ISD::SETUNE) {
11538 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11539 if (!C1 || !C1->isInfinity() || C1->isNegative())
11540 return SDValue();
11541
11542 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11543 SIInstrFlags::S_NORMAL |
11544 SIInstrFlags::N_ZERO |
11545 SIInstrFlags::P_ZERO |
11546 SIInstrFlags::N_SUBNORMAL |
11547 SIInstrFlags::P_SUBNORMAL;
11548
11549 static_assert(((~(SIInstrFlags::S_NAN |
11550 SIInstrFlags::Q_NAN |
11551 SIInstrFlags::N_INFINITY |
11552 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11553 "mask not equal");
11554
11555 SDLoc DL(N);
11556 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11557 X, DAG.getConstant(Mask, DL, MVT::i32));
11558 }
11559 }
11560 }
11561
11562 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11563 std::swap(LHS, RHS);
11564
11565 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11566 RHS.hasOneUse()) {
11567 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11568 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11569 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11570 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11571 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11572 (RHS.getOperand(0) == LHS.getOperand(0) &&
11573 LHS.getOperand(0) == LHS.getOperand(1))) {
11574 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11575 unsigned NewMask = LCC == ISD::SETO ?
11576 Mask->getZExtValue() & ~OrdMask :
11577 Mask->getZExtValue() & OrdMask;
11578
11579 SDLoc DL(N);
11580 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11581 DAG.getConstant(NewMask, DL, MVT::i32));
11582 }
11583 }
11584
11585 if (VT == MVT::i32 &&
11586 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11587 // and x, (sext cc from i1) => select cc, x, 0
11588 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11589 std::swap(LHS, RHS);
11590 if (isBoolSGPR(RHS.getOperand(0)))
11591 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11592 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11593 }
11594
11595 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11596 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11597 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11598 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11599 uint32_t LHSMask = getPermuteMask(LHS);
11600 uint32_t RHSMask = getPermuteMask(RHS);
11601 if (LHSMask != ~0u && RHSMask != ~0u) {
11602 // Canonicalize the expression in an attempt to have fewer unique masks
11603 // and therefore fewer registers used to hold the masks.
11604 if (LHSMask > RHSMask) {
11605 std::swap(LHSMask, RHSMask);
11606 std::swap(LHS, RHS);
11607 }
11608
11609 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11610 // set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11611 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11612 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11613
11614 // Check if we need to combine values from two sources within a byte.
11615 if (!(LHSUsedLanes & RHSUsedLanes) &&
11616 // If we select high and lower word keep it for SDWA.
11617 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11618 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11619 // Each byte in each mask is either selector mask 0-3, or has higher
11620 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11621 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11622 // mask which is not 0xff wins. By anding both masks we have a correct
11623 // result except that 0x0c shall be corrected to give 0x0c only.
11624 uint32_t Mask = LHSMask & RHSMask;
11625 for (unsigned I = 0; I < 32; I += 8) {
11626 uint32_t ByteSel = 0xff << I;
11627 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11628 Mask &= (0x0c << I) & 0xffffffff;
11629 }
11630
11631 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11632 // or 0x0c.
11633 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11634 SDLoc DL(N);
11635
11636 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11637 LHS.getOperand(0), RHS.getOperand(0),
11638 DAG.getConstant(Sel, DL, MVT::i32));
11639 }
11640 }
11641 }
11642
11643 return SDValue();
11644}
11645
11646// A key component of v_perm is a mapping between byte position of the src
11647// operands, and the byte position of the dest. To provide such, we need: 1. the
11648// node that provides x byte of the dest of the OR, and 2. the byte of the node
11649// used to provide that x byte. calculateByteProvider finds which node provides
11650// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11651// and finds an ultimate src and byte position For example: The supported
11652// LoadCombine pattern for vector loads is as follows
11653// t1
11654// or
11655// / \
11656// t2 t3
11657// zext shl
11658// | | \
11659// t4 t5 16
11660// or anyext
11661// / \ |
11662// t6 t7 t8
11663// srl shl or
11664// / | / \ / \
11665// t9 t10 t11 t12 t13 t14
11666// trunc* 8 trunc* 8 and and
11667// | | / | | \
11668// t15 t16 t17 t18 t19 t20
11669// trunc* 255 srl -256
11670// | / \
11671// t15 t15 16
11672//
11673// *In this example, the truncs are from i32->i16
11674//
11675// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11676// respectively. calculateSrcByte would find (given node) -> ultimate src &
11677// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11678// After finding the mapping, we can combine the tree into vperm t15, t16,
11679// 0x05000407
11680
11681// Find the source and byte position from a node.
11682// \p DestByte is the byte position of the dest of the or that the src
11683// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11684// dest of the or byte. \p Depth tracks how many recursive iterations we have
11685// performed.
11686static const std::optional<ByteProvider<SDValue>>
11687calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11688 unsigned Depth = 0) {
11689 // We may need to recursively traverse a series of SRLs
11690 if (Depth >= 6)
11691 return std::nullopt;
11692
11693 if (Op.getValueSizeInBits() < 8)
11694 return std::nullopt;
11695
11696 if (Op.getValueType().isVector())
11697 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11698
11699 switch (Op->getOpcode()) {
11700 case ISD::TRUNCATE: {
11701 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11702 }
11703
11704 case ISD::SIGN_EXTEND:
11705 case ISD::ZERO_EXTEND:
11706 case ISD::SIGN_EXTEND_INREG: {
11707 SDValue NarrowOp = Op->getOperand(0);
11708 auto NarrowVT = NarrowOp.getValueType();
11709 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11710 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11711 NarrowVT = VTSign->getVT();
11712 }
11713 if (!NarrowVT.isByteSized())
11714 return std::nullopt;
11715 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11716
11717 if (SrcIndex >= NarrowByteWidth)
11718 return std::nullopt;
11719 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11720 }
11721
11722 case ISD::SRA:
11723 case ISD::SRL: {
11724 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11725 if (!ShiftOp)
11726 return std::nullopt;
11727
11728 uint64_t BitShift = ShiftOp->getZExtValue();
11729
11730 if (BitShift % 8 != 0)
11731 return std::nullopt;
11732
11733 SrcIndex += BitShift / 8;
11734
11735 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11736 }
11737
11738 default: {
11739 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11740 }
11741 }
11742 llvm_unreachable("fully handled switch");
11743}
11744
11745// For a byte position in the result of an Or, traverse the tree and find the
11746// node (and the byte of the node) which ultimately provides this {Or,
11747// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11748// the byte position of the Op that corresponds with the originally requested
11749 // byte of the Or. \p Depth tracks how many recursive iterations we have
11750 // performed. \p StartingIndex is the originally requested byte of the Or.
11751static const std::optional<ByteProvider<SDValue>>
11752calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11753 unsigned StartingIndex = 0) {
11754 // Finding Src tree of RHS of or typically requires at least 1 additional
11755 // depth
11756 if (Depth > 6)
11757 return std::nullopt;
11758
11759 unsigned BitWidth = Op.getScalarValueSizeInBits();
11760 if (BitWidth % 8 != 0)
11761 return std::nullopt;
11762 if (Index > BitWidth / 8 - 1)
11763 return std::nullopt;
11764
11765 bool IsVec = Op.getValueType().isVector();
11766 switch (Op.getOpcode()) {
11767 case ISD::OR: {
11768 if (IsVec)
11769 return std::nullopt;
11770
11771 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11772 StartingIndex);
11773 if (!RHS)
11774 return std::nullopt;
11775 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11776 StartingIndex);
11777 if (!LHS)
11778 return std::nullopt;
11779 // A well formed Or will have two ByteProviders for each byte, one of which
11780 // is constant zero
11781 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11782 return std::nullopt;
11783 if (!LHS || LHS->isConstantZero())
11784 return RHS;
11785 if (!RHS || RHS->isConstantZero())
11786 return LHS;
11787 return std::nullopt;
11788 }
11789
11790 case ISD::AND: {
11791 if (IsVec)
11792 return std::nullopt;
11793
11794 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11795 if (!BitMaskOp)
11796 return std::nullopt;
11797
11798 uint32_t BitMask = BitMaskOp->getZExtValue();
11799 // Bits we expect for our StartingIndex
11800 uint32_t IndexMask = 0xFF << (Index * 8);
11801
11802 if ((IndexMask & BitMask) != IndexMask) {
11803 // If the result of the and partially provides the byte, then it
11804 // is not well formed
11805 if (IndexMask & BitMask)
11806 return std::nullopt;
11807 return ByteProvider<SDValue>::getConstantZero();
11808 }
11809
11810 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11811 }
11812
11813 case ISD::FSHR: {
11814 if (IsVec)
11815 return std::nullopt;
11816
11817 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11818 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11819 if (!ShiftOp || Op.getValueType().isVector())
11820 return std::nullopt;
11821
11822 uint64_t BitsProvided = Op.getValueSizeInBits();
11823 if (BitsProvided % 8 != 0)
11824 return std::nullopt;
11825
11826 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11827 if (BitShift % 8)
11828 return std::nullopt;
11829
11830 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11831 uint64_t ByteShift = BitShift / 8;
11832
11833 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11834 uint64_t BytesProvided = BitsProvided / 8;
11835 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11836 NewIndex %= BytesProvided;
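// For example, with fshr(X, Y, 8) on i32 operands, ByteShift is 1 and the X:Y
// concatenation is 8 bytes wide: Index 0 maps to byte 1 of Y and Index 3 maps
// to byte 0 of X, matching (X << 24) | (Y >> 8).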
11837 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11838 }
11839
11840 case ISD::SRA:
11841 case ISD::SRL: {
11842 if (IsVec)
11843 return std::nullopt;
11844
11845 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11846 if (!ShiftOp)
11847 return std::nullopt;
11848
11849 uint64_t BitShift = ShiftOp->getZExtValue();
11850 if (BitShift % 8)
11851 return std::nullopt;
11852
11853 auto BitsProvided = Op.getScalarValueSizeInBits();
11854 if (BitsProvided % 8 != 0)
11855 return std::nullopt;
11856
11857 uint64_t BytesProvided = BitsProvided / 8;
11858 uint64_t ByteShift = BitShift / 8;
11859 // The dest of the shift will have good [0 : (BytesProvided - ByteShift)) bytes.
11860 // If the byte we are trying to provide (as tracked by Index) falls in this
11861 // range, then the SRL provides the byte. The byte of interest of the src of
11862 // the SRL is Index + ByteShift.
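// For example, (srl x, 16) on i32 provides result bytes 0 and 1 from bytes 2
// and 3 of x; result bytes 2 and 3 are not traced any further here.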
11863 return BytesProvided - ByteShift > Index
11864 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11865 Index + ByteShift)
11866 : std::nullopt;
11867 }
11868
11869 case ISD::SHL: {
11870 if (IsVec)
11871 return std::nullopt;
11872
11873 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11874 if (!ShiftOp)
11875 return std::nullopt;
11876
11877 uint64_t BitShift = ShiftOp->getZExtValue();
11878 if (BitShift % 8 != 0)
11879 return std::nullopt;
11880 uint64_t ByteShift = BitShift / 8;
11881
11882 // If we are shifting by an amount greater than
11883 // the index we are trying to provide, then it provides 0s. If not,
11884 // then the byte is not definitively 0, and the corresponding byte
11885 // of interest is Index - ByteShift of the src.
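// For example, (shl x, 8) on i32: result byte 0 is a known zero, while result
// bytes 1-3 come from bytes 0-2 of x.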
11886 return Index < ByteShift
11887 ? ByteProvider<SDValue>::getConstantZero()
11888 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11889 Depth + 1, StartingIndex);
11890 }
11891 case ISD::ANY_EXTEND:
11892 case ISD::SIGN_EXTEND:
11893 case ISD::ZERO_EXTEND:
11894 case ISD::SIGN_EXTEND_INREG:
11895 case ISD::AssertZext:
11896 case ISD::AssertSext: {
11897 if (IsVec)
11898 return std::nullopt;
11899
11900 SDValue NarrowOp = Op->getOperand(0);
11901 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11902 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11903 Op->getOpcode() == ISD::AssertZext ||
11904 Op->getOpcode() == ISD::AssertSext) {
11905 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11906 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11907 }
11908 if (NarrowBitWidth % 8 != 0)
11909 return std::nullopt;
11910 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11911
11912 if (Index >= NarrowByteWidth)
11913 return Op.getOpcode() == ISD::ZERO_EXTEND
11914 ? std::optional<ByteProvider<SDValue>>(
11915 ByteProvider<SDValue>::getConstantZero())
11916 : std::nullopt;
11917 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
11918 }
11919
11920 case ISD::TRUNCATE: {
11921 if (IsVec)
11922 return std::nullopt;
11923
11924 uint64_t NarrowByteWidth = BitWidth / 8;
11925
11926 if (NarrowByteWidth >= Index) {
11927 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11928 StartingIndex);
11929 }
11930
11931 return std::nullopt;
11932 }
11933
11934 case ISD::CopyFromReg: {
11935 if (BitWidth / 8 > Index)
11936 return calculateSrcByte(Op, StartingIndex, Index);
11937
11938 return std::nullopt;
11939 }
11940
11941 case ISD::LOAD: {
11942 auto L = cast<LoadSDNode>(Op.getNode());
11943
11944 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11945 if (NarrowBitWidth % 8 != 0)
11946 return std::nullopt;
11947 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11948
11949 // If the width of the load does not reach the byte we are trying to provide
11950 // for and it is not a ZEXTLOAD, then the load does not provide for the byte
11951 // in question.
11952 if (Index >= NarrowByteWidth) {
11953 return L->getExtensionType() == ISD::ZEXTLOAD
11954 ? std::optional<ByteProvider<SDValue>>(
11955 ByteProvider<SDValue>::getConstantZero())
11956 : std::nullopt;
11957 }
11958
11959 if (NarrowByteWidth > Index) {
11960 return calculateSrcByte(Op, StartingIndex, Index);
11961 }
11962
11963 return std::nullopt;
11964 }
11965
11966 case ISD::BSWAP: {
11967 if (IsVec)
11968 return std::nullopt;
11969
11970 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
11971 Depth + 1, StartingIndex);
11972 }
11973
11974 case ISD::EXTRACT_VECTOR_ELT: {
11975 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11976 if (!IdxOp)
11977 return std::nullopt;
11978 auto VecIdx = IdxOp->getZExtValue();
11979 auto ScalarSize = Op.getScalarValueSizeInBits();
11980 if (ScalarSize != 32) {
11981 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
11982 }
11983
11984 return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
11985 StartingIndex, Index);
11986 }
11987
11988 case AMDGPUISD::PERM: {
11989 if (IsVec)
11990 return std::nullopt;
11991
11992 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11993 if (!PermMask)
11994 return std::nullopt;
11995
11996 auto IdxMask =
11997 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
11998 if (IdxMask > 0x07 && IdxMask != 0x0c)
11999 return std::nullopt;
12000
12001 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12002 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12003
12004 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12005 : std::optional<ByteProvider<SDValue>>(
12006 ByteProvider<SDValue>::getConstantZero());
12007 }
12008
12009 default: {
12010 return std::nullopt;
12011 }
12012 }
12013
12014 llvm_unreachable("fully handled switch");
12015}
12016
12017 // Returns true if the Operand is a scalar whose value comes from a 16-bit
12017 // source (a 16-bit load or an extension of a 16-bit value).
12018static bool isExtendedFrom16Bits(SDValue &Operand) {
12019
12020 switch (Operand.getOpcode()) {
12021 case ISD::ANY_EXTEND:
12022 case ISD::SIGN_EXTEND:
12023 case ISD::ZERO_EXTEND: {
12024 auto OpVT = Operand.getOperand(0).getValueType();
12025 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12026 }
12027 case ISD::LOAD: {
12028 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12029 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12030 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12031 ExtType == ISD::EXTLOAD) {
12032 auto MemVT = L->getMemoryVT();
12033 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12034 }
12035 return L->getMemoryVT().getSizeInBits() == 16;
12036 }
12037 default:
12038 return false;
12039 }
12040}
12041
12042// Returns true if the mask matches consecutive bytes, and the first byte
12043 // begins at an even (16-bit aligned) byte offset from the 0th byte.
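// For example, Mask = 0x0302 (bytes 2 and 3) satisfies both conditions, while
// Mask = 0x0201 is consecutive but starts at byte 1 and is rejected.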
12044static bool addresses16Bits(int Mask) {
12045 int Low8 = Mask & 0xff;
12046 int Hi8 = (Mask & 0xff00) >> 8;
12047
12048 assert(Low8 < 8 && Hi8 < 8);
12049 // Are the bytes contiguous in the order of increasing addresses.
12050 bool IsConsecutive = (Hi8 - Low8 == 1);
12051 // Is the first byte at location that is aligned for 16 bit instructions.
12052 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12053 // In this case, we still need code to extract the 16 bit operand, so it
12054 // is better to use i8 v_perm
12055 bool Is16Aligned = !(Low8 % 2);
12056
12057 return IsConsecutive && Is16Aligned;
12058}
12059
12060// Do not lower into v_perm if the operands are actually 16 bit
12061// and the selected bits (based on PermMask) correspond with two
12062// easily addressable 16 bit operands.
12063 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12064 SDValue &OtherOp) {
12065 int Low16 = PermMask & 0xffff;
12066 int Hi16 = (PermMask & 0xffff0000) >> 16;
12067
12068 auto TempOp = peekThroughBitcasts(Op);
12069 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12070
12071 auto OpIs16Bit =
12072 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12073 if (!OpIs16Bit)
12074 return true;
12075
12076 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12077 isExtendedFrom16Bits(TempOtherOp);
12078 if (!OtherOpIs16Bit)
12079 return true;
12080
12081 // Do we cleanly address both
12082 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12083}
12084
12085 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12086 unsigned DWordOffset) {
12087 SDValue Ret;
12088
12089 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12090 // ByteProvider must be at least 8 bits
12091 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12092
12093 if (TypeSize <= 32)
12094 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12095
12096 if (Src.getValueType().isVector()) {
12097 auto ScalarTySize = Src.getScalarValueSizeInBits();
12098 auto ScalarTy = Src.getValueType().getScalarType();
12099 if (ScalarTySize == 32) {
12100 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12101 DAG.getConstant(DWordOffset, SL, MVT::i32));
12102 }
12103 if (ScalarTySize > 32) {
12104 Ret = DAG.getNode(
12105 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12106 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12107 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12108 if (ShiftVal)
12109 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12110 DAG.getConstant(ShiftVal, SL, MVT::i32));
12111 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12112 }
12113
12114 assert(ScalarTySize < 32);
12115 auto NumElements = TypeSize / ScalarTySize;
12116 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12117 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12118 auto NumElementsIn32 = 32 / ScalarTySize;
12119 auto NumAvailElements = DWordOffset < Trunc32Elements
12120 ? NumElementsIn32
12121 : NumElements - NormalizedTrunc;
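// Illustrative example: for a v4i16 Src and DWordOffset == 1, elements 2 and 3
// are extracted, rebuilt as a v2i16, and bitcast to the returned i32 below.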
12122
12123 SmallVector<SDValue, 4> VecSrcs;
12124 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12125 NumAvailElements);
12126
12127 Ret = DAG.getBuildVector(
12128 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12129 VecSrcs);
12130 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12131 }
12132
12133 /// Scalar Type
12134 auto ShiftVal = 32 * DWordOffset;
12135 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12136 DAG.getConstant(ShiftVal, SL, MVT::i32));
12137 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12138}
12139
12140 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12141 SelectionDAG &DAG = DCI.DAG;
12142 [[maybe_unused]] EVT VT = N->getValueType(0);
12143 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12144
12145 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12146 assert(VT == MVT::i32);
12147 for (int i = 0; i < 4; i++) {
12148 // Find the ByteProvider that provides the ith byte of the result of OR
12149 std::optional<ByteProvider<SDValue>> P =
12150 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12151 // TODO support constantZero
12152 if (!P || P->isConstantZero())
12153 return SDValue();
12154
12155 PermNodes.push_back(*P);
12156 }
12157 if (PermNodes.size() != 4)
12158 return SDValue();
12159
12160 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12161 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12162 uint64_t PermMask = 0x00000000;
12163 for (size_t i = 0; i < PermNodes.size(); i++) {
12164 auto PermOp = PermNodes[i];
12165 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12166 // by sizeof(Src2) = 4
12167 int SrcByteAdjust = 4;
12168
12169 // If the Src uses a byte from a different DWORD, then it corresponds
12170 // with a different source
12171 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12172 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12173 if (SecondSrc)
12174 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12175 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12176 return SDValue();
12177
12178 // Set the index of the second distinct Src node
12179 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12180 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12181 SrcByteAdjust = 0;
12182 }
12183 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12185 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12186 }
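// Illustrative example: if result bytes 0 and 1 come from bytes 0 and 1 of the
// first distinct source and result bytes 2 and 3 come from bytes 2 and 3 of a
// second source, the loop above builds PermMask = 0x03020504 (the first
// source's byte indices are offset by 4).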
12187 SDLoc DL(N);
12188 SDValue Op = *PermNodes[FirstSrc.first].Src;
12189 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12190 assert(Op.getValueSizeInBits() == 32);
12191
12192 // Check that we are not just extracting the bytes in order from an op
12193 if (!SecondSrc) {
12194 int Low16 = PermMask & 0xffff;
12195 int Hi16 = (PermMask & 0xffff0000) >> 16;
12196
12197 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12198 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12199
12200 // The perm op would really just produce Op. So combine into Op
12201 if (WellFormedLow && WellFormedHi)
12202 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12203 }
12204
12205 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12206
12207 if (SecondSrc) {
12208 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12209 assert(OtherOp.getValueSizeInBits() == 32);
12210 }
12211
12212 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12213
12214 assert(Op.getValueType().isByteSized() &&
12215 OtherOp.getValueType().isByteSized());
12216
12217 // If the ultimate src is less than 32 bits, then we will only be
12218 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12219 // CalculateByteProvider would not have returned Op as source if we
12220 // used a byte that is outside its ValueType. Thus, we are free to
12221 // ANY_EXTEND as the extended bits are dont-cares.
12222 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12223 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12224
12225 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12226 DAG.getConstant(PermMask, DL, MVT::i32));
12227 }
12228 return SDValue();
12229}
12230
12231SDValue SITargetLowering::performOrCombine(SDNode *N,
12232 DAGCombinerInfo &DCI) const {
12233 SelectionDAG &DAG = DCI.DAG;
12234 SDValue LHS = N->getOperand(0);
12235 SDValue RHS = N->getOperand(1);
12236
12237 EVT VT = N->getValueType(0);
12238 if (VT == MVT::i1) {
12239 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12240 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12241 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12242 SDValue Src = LHS.getOperand(0);
12243 if (Src != RHS.getOperand(0))
12244 return SDValue();
12245
12246 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12247 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12248 if (!CLHS || !CRHS)
12249 return SDValue();
12250
12251 // Only 10 bits are used.
12252 static const uint32_t MaxMask = 0x3ff;
12253
12254 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12255 SDLoc DL(N);
12256 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12257 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12258 }
12259
12260 return SDValue();
12261 }
12262
12263 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12264 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12265 LHS.getOpcode() == AMDGPUISD::PERM &&
12266 isa<ConstantSDNode>(LHS.getOperand(2))) {
12267 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12268 if (!Sel)
12269 return SDValue();
12270
12271 Sel |= LHS.getConstantOperandVal(2);
12272 SDLoc DL(N);
12273 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12274 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12275 }
12276
12277 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12278 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12279 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12280 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12281
12282 // If all the uses of an or need to extract the individual elements, do not
12283 // attempt to lower into v_perm
12284 auto usesCombinedOperand = [](SDNode *OrUse) {
12285 // If we have any non-vectorized use, then it is a candidate for v_perm
12286 if (OrUse->getOpcode() != ISD::BITCAST ||
12287 !OrUse->getValueType(0).isVector())
12288 return true;
12289
12290 // If we have any non-vectorized use, then it is a candidate for v_perm
12291 for (auto VUse : OrUse->uses()) {
12292 if (!VUse->getValueType(0).isVector())
12293 return true;
12294
12295 // If the use of a vector is a store, then combining via a v_perm
12296 // is beneficial.
12297 // TODO -- whitelist more uses
12298 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12299 if (VUse->getOpcode() == VectorwiseOp)
12300 return true;
12301 }
12302 return false;
12303 };
12304
12305 if (!any_of(N->uses(), usesCombinedOperand))
12306 return SDValue();
12307
12308 uint32_t LHSMask = getPermuteMask(LHS);
12309 uint32_t RHSMask = getPermuteMask(RHS);
12310
12311 if (LHSMask != ~0u && RHSMask != ~0u) {
12312 // Canonicalize the expression in an attempt to have fewer unique masks
12313 // and therefore fewer registers used to hold the masks.
12314 if (LHSMask > RHSMask) {
12315 std::swap(LHSMask, RHSMask);
12316 std::swap(LHS, RHS);
12317 }
12318
12319 // Mark each lane actually used from the source operand with 0xc. In the mask,
12320 // a zero byte is 0x0c, a 0xff byte is 0xff, and real lanes are in the 0-3 range.
12321 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12322 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12323
12324 // Check if we need to combine values from two sources within a byte.
12325 if (!(LHSUsedLanes & RHSUsedLanes) &&
12326 // If we select high and lower word keep it for SDWA.
12327 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12328 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12329 // Kill zero bytes selected by other mask. Zero value is 0xc.
12330 LHSMask &= ~RHSUsedLanes;
12331 RHSMask &= ~LHSUsedLanes;
12332 // Add 4 to each active LHS lane
12333 LHSMask |= LHSUsedLanes & 0x04040404;
12334 // Combine masks
12335 uint32_t Sel = LHSMask | RHSMask;
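// Illustrative example (values chosen for exposition, not from a test): for
// or (and x, 0x000000ff), (and y, 0x0000ff00)
// the canonicalized masks are LHSMask = 0x0c0c010c (y) and
// RHSMask = 0x0c0c0c00 (x). Killing the zero bytes and adding 4 to the active
// LHS lane gives Sel = 0x0c0c0500, i.e. perm y, x, 0x0c0c0500, which keeps
// byte 0 of x, byte 1 of y, and zeroes the upper two bytes.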
12336 SDLoc DL(N);
12337
12338 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12339 LHS.getOperand(0), RHS.getOperand(0),
12340 DAG.getConstant(Sel, DL, MVT::i32));
12341 }
12342 }
12343 if (LHSMask == ~0u || RHSMask == ~0u) {
12344 if (SDValue Perm = matchPERM(N, DCI))
12345 return Perm;
12346 }
12347 }
12348
12349 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12350 return SDValue();
12351
12352 // TODO: This could be a generic combine with a predicate for extracting the
12353 // high half of an integer being free.
12354
12355 // (or i64:x, (zero_extend i32:y)) ->
12356 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12357 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12358 RHS.getOpcode() != ISD::ZERO_EXTEND)
12359 std::swap(LHS, RHS);
12360
12361 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12362 SDValue ExtSrc = RHS.getOperand(0);
12363 EVT SrcVT = ExtSrc.getValueType();
12364 if (SrcVT == MVT::i32) {
12365 SDLoc SL(N);
12366 SDValue LowLHS, HiBits;
12367 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12368 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12369
12370 DCI.AddToWorklist(LowOr.getNode());
12371 DCI.AddToWorklist(HiBits.getNode());
12372
12373 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12374 LowOr, HiBits);
12375 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12376 }
12377 }
12378
12379 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12380 if (CRHS) {
12381 if (SDValue Split
12382 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12383 N->getOperand(0), CRHS))
12384 return Split;
12385 }
12386
12387 return SDValue();
12388}
12389
12390SDValue SITargetLowering::performXorCombine(SDNode *N,
12391 DAGCombinerInfo &DCI) const {
12392 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12393 return RV;
12394
12395 SDValue LHS = N->getOperand(0);
12396 SDValue RHS = N->getOperand(1);
12397
12398 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12399 SelectionDAG &DAG = DCI.DAG;
12400
12401 EVT VT = N->getValueType(0);
12402 if (CRHS && VT == MVT::i64) {
12403 if (SDValue Split
12404 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12405 return Split;
12406 }
12407
12408 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12409 // fneg-like xors into 64-bit select.
12410 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12411 // This looks like an fneg, try to fold as a source modifier.
12412 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12413 shouldFoldFNegIntoSrc(N, LHS)) {
12414 // xor (select c, a, b), 0x80000000 ->
12415 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12416 SDLoc DL(N);
12417 SDValue CastLHS =
12418 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12419 SDValue CastRHS =
12420 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12421 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12422 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12423 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12424 LHS->getOperand(0), FNegLHS, FNegRHS);
12425 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12426 }
12427 }
12428
12429 return SDValue();
12430}
12431
12432SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12433 DAGCombinerInfo &DCI) const {
12434 if (!Subtarget->has16BitInsts() ||
12435 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12436 return SDValue();
12437
12438 EVT VT = N->getValueType(0);
12439 if (VT != MVT::i32)
12440 return SDValue();
12441
12442 SDValue Src = N->getOperand(0);
12443 if (Src.getValueType() != MVT::i16)
12444 return SDValue();
12445
12446 return SDValue();
12447}
12448
12449SDValue
12450SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12451 DAGCombinerInfo &DCI) const {
12452 SDValue Src = N->getOperand(0);
12453 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12454
12455 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12456 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12457 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12458 VTSign->getVT() == MVT::i8) ||
12459 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12460 VTSign->getVT() == MVT::i16))) {
12461 assert(Subtarget->hasScalarSubwordLoads() &&
12462 "s_buffer_load_{u8, i8} are supported "
12463 "in GFX12 (or newer) architectures.");
12464 EVT VT = Src.getValueType();
12465 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12466 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12467 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12468 SDLoc DL(N);
12469 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12470 SDValue Ops[] = {
12471 Src.getOperand(0), // source register
12472 Src.getOperand(1), // offset
12473 Src.getOperand(2) // cachePolicy
12474 };
12475 auto *M = cast<MemSDNode>(Src);
12476 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12477 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12478 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12479 return LoadVal;
12480 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12481 VTSign->getVT() == MVT::i8) ||
12482 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12483 VTSign->getVT() == MVT::i16)) &&
12484 Src.hasOneUse()) {
12485 auto *M = cast<MemSDNode>(Src);
12486 SDValue Ops[] = {
12487 Src.getOperand(0), // Chain
12488 Src.getOperand(1), // rsrc
12489 Src.getOperand(2), // vindex
12490 Src.getOperand(3), // voffset
12491 Src.getOperand(4), // soffset
12492 Src.getOperand(5), // offset
12493 Src.getOperand(6),
12494 Src.getOperand(7)
12495 };
12496 // replace with BUFFER_LOAD_BYTE/SHORT
12497 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12498 Src.getOperand(0).getValueType());
12499 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12500 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12501 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12502 ResList,
12503 Ops, M->getMemoryVT(),
12504 M->getMemOperand());
12505 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12506 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12507 }
12508 return SDValue();
12509}
12510
12511SDValue SITargetLowering::performClassCombine(SDNode *N,
12512 DAGCombinerInfo &DCI) const {
12513 SelectionDAG &DAG = DCI.DAG;
12514 SDValue Mask = N->getOperand(1);
12515
12516 // fp_class x, 0 -> false
12517 if (isNullConstant(Mask))
12518 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12519
12520 if (N->getOperand(0).isUndef())
12521 return DAG.getUNDEF(MVT::i1);
12522
12523 return SDValue();
12524}
12525
12526SDValue SITargetLowering::performRcpCombine(SDNode *N,
12527 DAGCombinerInfo &DCI) const {
12528 EVT VT = N->getValueType(0);
12529 SDValue N0 = N->getOperand(0);
12530
12531 if (N0.isUndef()) {
12532 return DCI.DAG.getConstantFP(
12533 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12534 VT);
12535 }
12536
12537 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12538 N0.getOpcode() == ISD::SINT_TO_FP)) {
12539 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12540 N->getFlags());
12541 }
12542
12543 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12544 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12545 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12546 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12547 N0.getOperand(0), N->getFlags());
12548 }
12549
12550 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12551}
12552
12553 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12554 unsigned MaxDepth) const {
12555 unsigned Opcode = Op.getOpcode();
12556 if (Opcode == ISD::FCANONICALIZE)
12557 return true;
12558
12559 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12560 const auto &F = CFP->getValueAPF();
12561 if (F.isNaN() && F.isSignaling())
12562 return false;
12563 if (!F.isDenormal())
12564 return true;
12565
12566 DenormalMode Mode =
12567 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12568 return Mode == DenormalMode::getIEEE();
12569 }
12570
12571 // If source is a result of another standard FP operation it is already in
12572 // canonical form.
12573 if (MaxDepth == 0)
12574 return false;
12575
12576 switch (Opcode) {
12577 // These will flush denorms if required.
12578 case ISD::FADD:
12579 case ISD::FSUB:
12580 case ISD::FMUL:
12581 case ISD::FCEIL:
12582 case ISD::FFLOOR:
12583 case ISD::FMA:
12584 case ISD::FMAD:
12585 case ISD::FSQRT:
12586 case ISD::FDIV:
12587 case ISD::FREM:
12588 case ISD::FP_ROUND:
12589 case ISD::FP_EXTEND:
12590 case ISD::FP16_TO_FP:
12591 case ISD::FP_TO_FP16:
12592 case ISD::BF16_TO_FP:
12593 case ISD::FP_TO_BF16:
12594 case ISD::FLDEXP:
12597 case AMDGPUISD::RCP:
12598 case AMDGPUISD::RSQ:
12602 case AMDGPUISD::LOG:
12603 case AMDGPUISD::EXP:
12607 case AMDGPUISD::FRACT:
12614 case AMDGPUISD::SIN_HW:
12615 case AMDGPUISD::COS_HW:
12616 return true;
12617
12618 // It can/will be lowered or combined as a bit operation.
12619 // Need to check their input recursively to handle.
12620 case ISD::FNEG:
12621 case ISD::FABS:
12622 case ISD::FCOPYSIGN:
12623 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12624
12625 case ISD::AND:
12626 if (Op.getValueType() == MVT::i32) {
12627 // Be careful as we only know it is a bitcast floating point type. It
12628 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12629 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12630 // is valid to optimize for all types.
12631 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12632 if (RHS->getZExtValue() == 0xffff0000) {
12633 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12634 }
12635 }
12636 }
12637 break;
12638
12639 case ISD::FSIN:
12640 case ISD::FCOS:
12641 case ISD::FSINCOS:
12642 return Op.getValueType().getScalarType() != MVT::f16;
12643
12644 case ISD::FMINNUM:
12645 case ISD::FMAXNUM:
12646 case ISD::FMINNUM_IEEE:
12647 case ISD::FMAXNUM_IEEE:
12648 case ISD::FMINIMUM:
12649 case ISD::FMAXIMUM:
12650 case AMDGPUISD::CLAMP:
12651 case AMDGPUISD::FMED3:
12652 case AMDGPUISD::FMAX3:
12653 case AMDGPUISD::FMIN3:
12654 case AMDGPUISD::FMAXIMUM3:
12655 case AMDGPUISD::FMINIMUM3: {
12656 // FIXME: Shouldn't treat the generic operations differently based on these.
12657 // However, we aren't really required to flush the result from
12658 // minnum/maxnum.
12659
12660 // snans will be quieted, so we only need to worry about denormals.
12661 if (Subtarget->supportsMinMaxDenormModes() ||
12662 // FIXME: denormalsEnabledForType is broken for dynamic
12663 denormalsEnabledForType(DAG, Op.getValueType()))
12664 return true;
12665
12666 // Flushing may be required.
12667 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12668 // targets need to check their input recursively.
12669
12670 // FIXME: Does this apply with clamp? It's implemented with max.
12671 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12672 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12673 return false;
12674 }
12675
12676 return true;
12677 }
12678 case ISD::SELECT: {
12679 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12680 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12681 }
12682 case ISD::BUILD_VECTOR: {
12683 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12684 SDValue SrcOp = Op.getOperand(i);
12685 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12686 return false;
12687 }
12688
12689 return true;
12690 }
12693 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12694 }
12696 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12697 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12698 }
12699 case ISD::UNDEF:
12700 // Could be anything.
12701 return false;
12702
12703 case ISD::BITCAST:
12704 // TODO: This is incorrect as it loses track of the operand's type. We may
12705 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12706 // same bits that are canonicalized in one type need not be in the other.
12707 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12708 case ISD::TRUNCATE: {
12709 // Hack around the mess we make when legalizing extract_vector_elt
12710 if (Op.getValueType() == MVT::i16) {
12711 SDValue TruncSrc = Op.getOperand(0);
12712 if (TruncSrc.getValueType() == MVT::i32 &&
12713 TruncSrc.getOpcode() == ISD::BITCAST &&
12714 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12715 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12716 }
12717 }
12718 return false;
12719 }
12720 case ISD::INTRINSIC_WO_CHAIN: {
12721 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12722 // TODO: Handle more intrinsics
12723 switch (IntrinsicID) {
12724 case Intrinsic::amdgcn_cvt_pkrtz:
12725 case Intrinsic::amdgcn_cubeid:
12726 case Intrinsic::amdgcn_frexp_mant:
12727 case Intrinsic::amdgcn_fdot2:
12728 case Intrinsic::amdgcn_rcp:
12729 case Intrinsic::amdgcn_rsq:
12730 case Intrinsic::amdgcn_rsq_clamp:
12731 case Intrinsic::amdgcn_rcp_legacy:
12732 case Intrinsic::amdgcn_rsq_legacy:
12733 case Intrinsic::amdgcn_trig_preop:
12734 case Intrinsic::amdgcn_log:
12735 case Intrinsic::amdgcn_exp2:
12736 case Intrinsic::amdgcn_sqrt:
12737 return true;
12738 default:
12739 break;
12740 }
12741
12742 break;
12743 }
12744 default:
12745 break;
12746 }
12747
12748 // FIXME: denormalsEnabledForType is broken for dynamic
12749 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12750 DAG.isKnownNeverSNaN(Op);
12751}
12752
12753 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12754 unsigned MaxDepth) const {
12755 const MachineRegisterInfo &MRI = MF.getRegInfo();
12756 MachineInstr *MI = MRI.getVRegDef(Reg);
12757 unsigned Opcode = MI->getOpcode();
12758
12759 if (Opcode == AMDGPU::G_FCANONICALIZE)
12760 return true;
12761
12762 std::optional<FPValueAndVReg> FCR;
12763 // Constant splat (can be padded with undef) or scalar constant.
12764 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12765 if (FCR->Value.isSignaling())
12766 return false;
12767 if (!FCR->Value.isDenormal())
12768 return true;
12769
12770 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12771 return Mode == DenormalMode::getIEEE();
12772 }
12773
12774 if (MaxDepth == 0)
12775 return false;
12776
12777 switch (Opcode) {
12778 case AMDGPU::G_FADD:
12779 case AMDGPU::G_FSUB:
12780 case AMDGPU::G_FMUL:
12781 case AMDGPU::G_FCEIL:
12782 case AMDGPU::G_FFLOOR:
12783 case AMDGPU::G_FRINT:
12784 case AMDGPU::G_FNEARBYINT:
12785 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12786 case AMDGPU::G_INTRINSIC_TRUNC:
12787 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12788 case AMDGPU::G_FMA:
12789 case AMDGPU::G_FMAD:
12790 case AMDGPU::G_FSQRT:
12791 case AMDGPU::G_FDIV:
12792 case AMDGPU::G_FREM:
12793 case AMDGPU::G_FPOW:
12794 case AMDGPU::G_FPEXT:
12795 case AMDGPU::G_FLOG:
12796 case AMDGPU::G_FLOG2:
12797 case AMDGPU::G_FLOG10:
12798 case AMDGPU::G_FPTRUNC:
12799 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12800 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12801 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12802 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12803 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12804 return true;
12805 case AMDGPU::G_FNEG:
12806 case AMDGPU::G_FABS:
12807 case AMDGPU::G_FCOPYSIGN:
12808 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12809 case AMDGPU::G_FMINNUM:
12810 case AMDGPU::G_FMAXNUM:
12811 case AMDGPU::G_FMINNUM_IEEE:
12812 case AMDGPU::G_FMAXNUM_IEEE:
12813 case AMDGPU::G_FMINIMUM:
12814 case AMDGPU::G_FMAXIMUM: {
12815 if (Subtarget->supportsMinMaxDenormModes() ||
12816 // FIXME: denormalsEnabledForType is broken for dynamic
12817 denormalsEnabledForType(MRI.getType(Reg), MF))
12818 return true;
12819
12820 [[fallthrough]];
12821 }
12822 case AMDGPU::G_BUILD_VECTOR:
12823 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12824 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12825 return false;
12826 return true;
12827 case AMDGPU::G_INTRINSIC:
12828 case AMDGPU::G_INTRINSIC_CONVERGENT:
12829 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12830 case Intrinsic::amdgcn_fmul_legacy:
12831 case Intrinsic::amdgcn_fmad_ftz:
12832 case Intrinsic::amdgcn_sqrt:
12833 case Intrinsic::amdgcn_fmed3:
12834 case Intrinsic::amdgcn_sin:
12835 case Intrinsic::amdgcn_cos:
12836 case Intrinsic::amdgcn_log:
12837 case Intrinsic::amdgcn_exp2:
12838 case Intrinsic::amdgcn_log_clamp:
12839 case Intrinsic::amdgcn_rcp:
12840 case Intrinsic::amdgcn_rcp_legacy:
12841 case Intrinsic::amdgcn_rsq:
12842 case Intrinsic::amdgcn_rsq_clamp:
12843 case Intrinsic::amdgcn_rsq_legacy:
12844 case Intrinsic::amdgcn_div_scale:
12845 case Intrinsic::amdgcn_div_fmas:
12846 case Intrinsic::amdgcn_div_fixup:
12847 case Intrinsic::amdgcn_fract:
12848 case Intrinsic::amdgcn_cvt_pkrtz:
12849 case Intrinsic::amdgcn_cubeid:
12850 case Intrinsic::amdgcn_cubema:
12851 case Intrinsic::amdgcn_cubesc:
12852 case Intrinsic::amdgcn_cubetc:
12853 case Intrinsic::amdgcn_frexp_mant:
12854 case Intrinsic::amdgcn_fdot2:
12855 case Intrinsic::amdgcn_trig_preop:
12856 return true;
12857 default:
12858 break;
12859 }
12860
12861 [[fallthrough]];
12862 default:
12863 return false;
12864 }
12865
12866 llvm_unreachable("invalid operation");
12867}
12868
12869// Constant fold canonicalize.
12870SDValue SITargetLowering::getCanonicalConstantFP(
12871 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12872 // Flush denormals to 0 if not enabled.
12873 if (C.isDenormal()) {
12874 DenormalMode Mode =
12875 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12876 if (Mode == DenormalMode::getPreserveSign()) {
12877 return DAG.getConstantFP(
12878 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12879 }
12880
12881 if (Mode != DenormalMode::getIEEE())
12882 return SDValue();
12883 }
12884
12885 if (C.isNaN()) {
12886 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12887 if (C.isSignaling()) {
12888 // Quiet a signaling NaN.
12889 // FIXME: Is this supposed to preserve payload bits?
12890 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12891 }
12892
12893 // Make sure it is the canonical NaN bitpattern.
12894 //
12895 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12896 // immediate?
12897 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12898 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12899 }
12900
12901 // Already canonical.
12902 return DAG.getConstantFP(C, SL, VT);
12903}
12904
12906 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12907}
12908
12909SDValue SITargetLowering::performFCanonicalizeCombine(
12910 SDNode *N,
12911 DAGCombinerInfo &DCI) const {
12912 SelectionDAG &DAG = DCI.DAG;
12913 SDValue N0 = N->getOperand(0);
12914 EVT VT = N->getValueType(0);
12915
12916 // fcanonicalize undef -> qnan
12917 if (N0.isUndef()) {
12918 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
12919 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
12920 }
12921
12922 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
12923 EVT VT = N->getValueType(0);
12924 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
12925 }
12926
12927 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12928 // (fcanonicalize k)
12929 //
12930 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12931
12932 // TODO: This could be better with wider vectors that will be split to v2f16,
12933 // and to consider uses since there aren't that many packed operations.
12934 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12935 isTypeLegal(MVT::v2f16)) {
12936 SDLoc SL(N);
12937 SDValue NewElts[2];
12938 SDValue Lo = N0.getOperand(0);
12939 SDValue Hi = N0.getOperand(1);
12940 EVT EltVT = Lo.getValueType();
12941
12943 for (unsigned I = 0; I != 2; ++I) {
12944 SDValue Op = N0.getOperand(I);
12945 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12946 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
12947 CFP->getValueAPF());
12948 } else if (Op.isUndef()) {
12949 // Handled below based on what the other operand is.
12950 NewElts[I] = Op;
12951 } else {
12952 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
12953 }
12954 }
12955
12956 // If one half is undef, and one is constant, prefer a splat vector rather
12957 // than the normal qNaN. If it's a register, prefer 0.0 since that's
12958 // cheaper to use and may be free with a packed operation.
12959 if (NewElts[0].isUndef()) {
12960 if (isa<ConstantFPSDNode>(NewElts[1]))
12961 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
12962 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
12963 }
12964
12965 if (NewElts[1].isUndef()) {
12966 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
12967 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
12968 }
12969
12970 return DAG.getBuildVector(VT, SL, NewElts);
12971 }
12972 }
12973
12974 return SDValue();
12975}
12976
12977static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
12978 switch (Opc) {
12979 case ISD::FMAXNUM:
12980 case ISD::FMAXNUM_IEEE:
12981 return AMDGPUISD::FMAX3;
12982 case ISD::FMAXIMUM:
12983 return AMDGPUISD::FMAXIMUM3;
12984 case ISD::SMAX:
12985 return AMDGPUISD::SMAX3;
12986 case ISD::UMAX:
12987 return AMDGPUISD::UMAX3;
12988 case ISD::FMINNUM:
12989 case ISD::FMINNUM_IEEE:
12990 return AMDGPUISD::FMIN3;
12991 case ISD::FMINIMUM:
12992 return AMDGPUISD::FMINIMUM3;
12993 case ISD::SMIN:
12994 return AMDGPUISD::SMIN3;
12995 case ISD::UMIN:
12996 return AMDGPUISD::UMIN3;
12997 default:
12998 llvm_unreachable("Not a min/max opcode");
12999 }
13000}
13001
13002SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13003 const SDLoc &SL, SDValue Src,
13004 SDValue MinVal,
13005 SDValue MaxVal,
13006 bool Signed) const {
13007
13008 // med3 comes from
13009 // min(max(x, K0), K1), K0 < K1
13010 // max(min(x, K0), K1), K1 < K0
13011 //
13012 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13013 // min/max op.
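// For example, on i32, smin(smax(x, -5), 10) becomes smed3(x, -5, 10), which
// clamps x to the range [-5, 10].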
13014 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13015 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13016
13017 if (!MinK || !MaxK)
13018 return SDValue();
13019
13020 if (Signed) {
13021 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13022 return SDValue();
13023 } else {
13024 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13025 return SDValue();
13026 }
13027
13028 EVT VT = MinK->getValueType(0);
13029 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13030 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13031 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13032
13033 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13034 // not available, but this is unlikely to be profitable as constants
13035 // will often need to be materialized & extended, especially on
13036 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13037 return SDValue();
13038}
13039
13040 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13041 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13042 return C;
13043
13044 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13045 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13046 return C;
13047 }
13048
13049 return nullptr;
13050}
13051
13052SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13053 const SDLoc &SL,
13054 SDValue Op0,
13055 SDValue Op1) const {
13056 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13057 if (!K1)
13058 return SDValue();
13059
13060 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13061 if (!K0)
13062 return SDValue();
13063
13064 // Ordered >= (although NaN inputs should have folded away by now).
13065 if (K0->getValueAPF() > K1->getValueAPF())
13066 return SDValue();
13067
13068 const MachineFunction &MF = DAG.getMachineFunction();
13069 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13070
13071 // TODO: Check IEEE bit enabled?
13072 EVT VT = Op0.getValueType();
13073 if (Info->getMode().DX10Clamp) {
13074 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13075 // hardware fmed3 behavior converting to a min.
13076 // FIXME: Should this be allowing -0.0?
13077 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13078 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13079 }
13080
13081 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13082 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13083 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13084 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13085 // then give the other result, which is different from med3 with a NaN
13086 // input.
13087 SDValue Var = Op0.getOperand(0);
13088 if (!DAG.isKnownNeverSNaN(Var))
13089 return SDValue();
13090
13091 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13092
13093 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13094 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13095 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13096 Var, SDValue(K0, 0), SDValue(K1, 0));
13097 }
13098 }
13099
13100 return SDValue();
13101}
13102
13103SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13104 DAGCombinerInfo &DCI) const {
13105 SelectionDAG &DAG = DCI.DAG;
13106
13107 EVT VT = N->getValueType(0);
13108 unsigned Opc = N->getOpcode();
13109 SDValue Op0 = N->getOperand(0);
13110 SDValue Op1 = N->getOperand(1);
13111
13112 // Only do this if the inner op has one use since this will just increase
13113 // register pressure for no benefit.
13114
13115 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
13116 !VT.isVector() &&
13117 (VT == MVT::i32 || VT == MVT::f32 ||
13118 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
13119 // max(max(a, b), c) -> max3(a, b, c)
13120 // min(min(a, b), c) -> min3(a, b, c)
13121 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13122 SDLoc DL(N);
13123 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13124 DL,
13125 N->getValueType(0),
13126 Op0.getOperand(0),
13127 Op0.getOperand(1),
13128 Op1);
13129 }
13130
13131 // Try commuted.
13132 // max(a, max(b, c)) -> max3(a, b, c)
13133 // min(a, min(b, c)) -> min3(a, b, c)
13134 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13135 SDLoc DL(N);
13136 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13137 DL,
13138 N->getValueType(0),
13139 Op0,
13140 Op1.getOperand(0),
13141 Op1.getOperand(1));
13142 }
13143 }
13144
13145 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13146 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13147 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13148 if (SDValue Med3 = performIntMed3ImmCombine(
13149 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13150 return Med3;
13151 }
13152 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13153 if (SDValue Med3 = performIntMed3ImmCombine(
13154 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13155 return Med3;
13156 }
13157
13158 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13159 if (SDValue Med3 = performIntMed3ImmCombine(
13160 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13161 return Med3;
13162 }
13163 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13164 if (SDValue Med3 = performIntMed3ImmCombine(
13165 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13166 return Med3;
13167 }
13168
13169 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13170 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13171 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13172 (Opc == AMDGPUISD::FMIN_LEGACY &&
13173 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13174 (VT == MVT::f32 || VT == MVT::f64 ||
13175 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13176 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13177 Op0.hasOneUse()) {
13178 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13179 return Res;
13180 }
13181
13182 return SDValue();
13183}
13184
13185 static bool isClampZeroToOne(SDValue A, SDValue B) {
13186 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13187 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13188 // FIXME: Should this be allowing -0.0?
13189 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13190 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13191 }
13192 }
13193
13194 return false;
13195}
13196
13197// FIXME: Should only worry about snans for version with chain.
13198SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13199 DAGCombinerInfo &DCI) const {
13200 EVT VT = N->getValueType(0);
13201 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13202 // NaNs. With a NaN input, the order of the operands may change the result.
13203
13204 SelectionDAG &DAG = DCI.DAG;
13205 SDLoc SL(N);
13206
13207 SDValue Src0 = N->getOperand(0);
13208 SDValue Src1 = N->getOperand(1);
13209 SDValue Src2 = N->getOperand(2);
13210
13211 if (isClampZeroToOne(Src0, Src1)) {
13212 // const_a, const_b, x -> clamp is safe in all cases including signaling
13213 // nans.
13214 // FIXME: Should this be allowing -0.0?
13215 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13216 }
13217
13218 const MachineFunction &MF = DAG.getMachineFunction();
13219 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13220
13221 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13222 // handling no dx10-clamp?
13223 if (Info->getMode().DX10Clamp) {
13224 // If NaNs are clamped to 0, we are free to reorder the inputs.
13225
13226 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13227 std::swap(Src0, Src1);
13228
13229 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13230 std::swap(Src1, Src2);
13231
13232 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13233 std::swap(Src0, Src1);
13234
13235 if (isClampZeroToOne(Src1, Src2))
13236 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13237 }
13238
13239 return SDValue();
13240}
13241
13242SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13243 DAGCombinerInfo &DCI) const {
13244 SDValue Src0 = N->getOperand(0);
13245 SDValue Src1 = N->getOperand(1);
13246 if (Src0.isUndef() && Src1.isUndef())
13247 return DCI.DAG.getUNDEF(N->getValueType(0));
13248 return SDValue();
13249}
13250
13251// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13252// expanded into a set of cmp/select instructions.
13253 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13254 unsigned NumElem,
13255 bool IsDivergentIdx,
13256 const GCNSubtarget *Subtarget) {
13257 if (UseDivergentRegisterIndexing)
13258 return false;
13259
13260 unsigned VecSize = EltSize * NumElem;
13261
13262 // Sub-dword vectors of 2 dwords or less have a better implementation.
13263 if (VecSize <= 64 && EltSize < 32)
13264 return false;
13265
13266 // Always expand the rest of the sub-dword instructions, otherwise they will be
13267 // lowered via memory.
13268 if (EltSize < 32)
13269 return true;
13270
13271 // Always do this if var-idx is divergent, otherwise it will become a loop.
13272 if (IsDivergentIdx)
13273 return true;
13274
13275 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13276 unsigned NumInsts = NumElem /* Number of compares */ +
13277 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
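// For example, a dynamic index into v8i32 gives NumInsts = 8 + 8 = 16, so it
// is expanded when movrel is unavailable (16 <= 16) but uses movrel otherwise
// (16 > 15).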
13278
13279 // On some architectures (GFX9) movrel is not available and it's better
13280 // to expand.
13281 if (!Subtarget->hasMovrel())
13282 return NumInsts <= 16;
13283
13284 // If movrel is available, use it instead of expanding for vector of 8
13285 // elements.
13286 return NumInsts <= 15;
13287}
13288
13289 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13290 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13291 if (isa<ConstantSDNode>(Idx))
13292 return false;
13293
13294 SDValue Vec = N->getOperand(0);
13295 EVT VecVT = Vec.getValueType();
13296 EVT EltVT = VecVT.getVectorElementType();
13297 unsigned EltSize = EltVT.getSizeInBits();
13298 unsigned NumElem = VecVT.getVectorNumElements();
13299
13300 return SITargetLowering::shouldExpandVectorDynExt(
13301 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13302}
13303
13304SDValue SITargetLowering::performExtractVectorEltCombine(
13305 SDNode *N, DAGCombinerInfo &DCI) const {
13306 SDValue Vec = N->getOperand(0);
13307 SelectionDAG &DAG = DCI.DAG;
13308
13309 EVT VecVT = Vec.getValueType();
13310 EVT VecEltVT = VecVT.getVectorElementType();
13311 EVT ResVT = N->getValueType(0);
13312
13313 unsigned VecSize = VecVT.getSizeInBits();
13314 unsigned VecEltSize = VecEltVT.getSizeInBits();
13315
13316 if ((Vec.getOpcode() == ISD::FNEG ||
13317 Vec.getOpcode() == ISD::FABS) && Vec.hasOneUse()) {
13318 SDLoc SL(N);
13319 SDValue Idx = N->getOperand(1);
13320 SDValue Elt =
13321 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13322 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13323 }
13324
13325 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13326 // =>
13327 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13328 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13329 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13330 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13331 SDLoc SL(N);
13332 SDValue Idx = N->getOperand(1);
13333 unsigned Opc = Vec.getOpcode();
13334
13335 switch(Opc) {
13336 default:
13337 break;
13338 // TODO: Support other binary operations.
13339 case ISD::FADD:
13340 case ISD::FSUB:
13341 case ISD::FMUL:
13342 case ISD::ADD:
13343 case ISD::UMIN:
13344 case ISD::UMAX:
13345 case ISD::SMIN:
13346 case ISD::SMAX:
13347 case ISD::FMAXNUM:
13348 case ISD::FMINNUM:
13349 case ISD::FMAXNUM_IEEE:
13350 case ISD::FMINNUM_IEEE:
13351 case ISD::FMAXIMUM:
13352 case ISD::FMINIMUM: {
13353 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13354 Vec.getOperand(0), Idx);
13355 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13356 Vec.getOperand(1), Idx);
13357
13358 DCI.AddToWorklist(Elt0.getNode());
13359 DCI.AddToWorklist(Elt1.getNode());
13360 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13361 }
13362 }
13363 }
13364
13365 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13366 if (shouldExpandVectorDynExt(N)) {
13367 SDLoc SL(N);
13368 SDValue Idx = N->getOperand(1);
13369 SDValue V;
13370 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13371 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13372 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13373 if (I == 0)
13374 V = Elt;
13375 else
13376 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13377 }
13378 return V;
13379 }
13380
13381 if (!DCI.isBeforeLegalize())
13382 return SDValue();
13383
13384 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13385 // elements. This exposes more load reduction opportunities by replacing
13386 // multiple small extract_vector_elements with a single 32-bit extract.
13387 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13388 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13389 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13390 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13391
13392 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13393 unsigned EltIdx = BitIndex / 32;
13394 unsigned LeftoverBitIdx = BitIndex % 32;
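  // e.g. extracting element 5 of a v8i8 vector: BitIndex = 40, so the value
  // lives in 32-bit element 1 at bit offset 8 within that dword.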
13395 SDLoc SL(N);
13396
13397 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13398 DCI.AddToWorklist(Cast.getNode());
13399
13400 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13401 DAG.getConstant(EltIdx, SL, MVT::i32));
13402 DCI.AddToWorklist(Elt.getNode());
13403 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13404 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13405 DCI.AddToWorklist(Srl.getNode());
13406
13407 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13408 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13409 DCI.AddToWorklist(Trunc.getNode());
13410
13411 if (VecEltVT == ResVT) {
13412 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13413 }
13414
13415 assert(ResVT.isScalarInteger());
13416 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13417 }
13418
13419 return SDValue();
13420}
13421
13422SDValue
13423SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13424 DAGCombinerInfo &DCI) const {
13425 SDValue Vec = N->getOperand(0);
13426 SDValue Idx = N->getOperand(2);
13427 EVT VecVT = Vec.getValueType();
13428 EVT EltVT = VecVT.getVectorElementType();
13429
13430 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13431 // => BUILD_VECTOR n x select (e, const-idx)
13432 if (!shouldExpandVectorDynExt(N))
13433 return SDValue();
13434
13435 SelectionDAG &DAG = DCI.DAG;
13436 SDLoc SL(N);
13437 SDValue Ins = N->getOperand(1);
13438 EVT IdxVT = Idx.getValueType();
13439
13440 SmallVector<SDValue, 16> Ops;
13441 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13442 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13443 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13444 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13445 Ops.push_back(V);
13446 }
13447
13448 return DAG.getBuildVector(VecVT, SL, Ops);
13449}
13450
13451/// Return the source of an fp_extend from f16 to f32, or a converted FP
13452/// constant.
13453static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13454 if (Src.getOpcode() == ISD::FP_EXTEND &&
13455 Src.getOperand(0).getValueType() == MVT::f16) {
13456 return Src.getOperand(0);
13457 }
13458
13459 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13460 APFloat Val = CFP->getValueAPF();
13461 bool LosesInfo = true;
13462 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13463 if (!LosesInfo)
13464 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13465 }
13466
13467 return SDValue();
13468}
13469
13470SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13471 DAGCombinerInfo &DCI) const {
13472 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13473 "combine only useful on gfx8");
13474
13475 SDValue TruncSrc = N->getOperand(0);
13476 EVT VT = N->getValueType(0);
13477 if (VT != MVT::f16)
13478 return SDValue();
13479
13480 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13481 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13482 return SDValue();
13483
13484 SelectionDAG &DAG = DCI.DAG;
13485 SDLoc SL(N);
13486
13487 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13488 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13489 // casting back.
13490
13491 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13492 // fmin(fmax(a, b), fmax(fmin(a, b), c))
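  // e.g. for a = 1.0, b = 3.0, c = 2.0: fmin(a, b) = 1.0, fmax(a, b) = 3.0,
  // fmax(1.0, c) = 2.0 and fmin(3.0, 2.0) = 2.0, which is the median of the
  // three inputs.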
13493 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13494 if (!A)
13495 return SDValue();
13496
13497 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13498 if (!B)
13499 return SDValue();
13500
13501 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13502 if (!C)
13503 return SDValue();
13504
13505 // This changes signaling nan behavior. If an input is a signaling nan, it
13506 // would have been quieted by the fpext originally. We don't care because
13507 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13508 // we would be worse off than just doing the promotion.
13509 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13510 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13511 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13512 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13513}
13514
13515unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13516 const SDNode *N0,
13517 const SDNode *N1) const {
13518 EVT VT = N0->getValueType(0);
13519
13520 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13521 // support denormals ever.
13522 if (((VT == MVT::f32 &&
13523 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13524 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13525 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13526 isOperationLegal(ISD::FMAD, VT))
13527 return ISD::FMAD;
13528
13529 const TargetOptions &Options = DAG.getTarget().Options;
13530 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13531 (N0->getFlags().hasAllowContract() &&
13532 N1->getFlags().hasAllowContract())) &&
13533 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13534 return ISD::FMA;
13535 }
13536
13537 return 0;
13538}
13539
13540// For a reassociatable opcode perform:
13541// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13542SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13543 SelectionDAG &DAG) const {
13544 EVT VT = N->getValueType(0);
13545 if (VT != MVT::i32 && VT != MVT::i64)
13546 return SDValue();
13547
13548 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13549 return SDValue();
13550
13551 unsigned Opc = N->getOpcode();
13552 SDValue Op0 = N->getOperand(0);
13553 SDValue Op1 = N->getOperand(1);
13554
13555 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13556 return SDValue();
13557
13558 if (Op0->isDivergent())
13559 std::swap(Op0, Op1);
13560
13561 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13562 return SDValue();
13563
13564 SDValue Op2 = Op1.getOperand(1);
13565 Op1 = Op1.getOperand(0);
13566 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13567 return SDValue();
13568
13569 if (Op1->isDivergent())
13570 std::swap(Op1, Op2);
13571
13572 SDLoc SL(N);
13573 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13574 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13575}
13576
13577static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13578 EVT VT,
13579 SDValue N0, SDValue N1, SDValue N2,
13580 bool Signed) {
13581 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13582 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13583 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13584 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13585}
13586
13587// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13588// multiplies, if any.
13589//
13590// Full 64-bit multiplies that feed into an addition are lowered here instead
13591// of using the generic expansion. The generic expansion ends up with
13592// a tree of ADD nodes that prevents us from using the "add" part of the
13593// MAD instruction. The expansion produced here results in a chain of ADDs
13594// instead of a tree.
13595SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13596 DAGCombinerInfo &DCI) const {
13597 assert(N->getOpcode() == ISD::ADD);
13598
13599 SelectionDAG &DAG = DCI.DAG;
13600 EVT VT = N->getValueType(0);
13601 SDLoc SL(N);
13602 SDValue LHS = N->getOperand(0);
13603 SDValue RHS = N->getOperand(1);
13604
13605 if (VT.isVector())
13606 return SDValue();
13607
13608 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13609 // result in scalar registers for uniform values.
13610 if (!N->isDivergent() && Subtarget->hasSMulHi())
13611 return SDValue();
13612
13613 unsigned NumBits = VT.getScalarSizeInBits();
13614 if (NumBits <= 32 || NumBits > 64)
13615 return SDValue();
13616
13617 if (LHS.getOpcode() != ISD::MUL) {
13618 assert(RHS.getOpcode() == ISD::MUL);
13619 std::swap(LHS, RHS);
13620 }
13621
13622 // Avoid the fold if it would unduly increase the number of multiplies due to
13623 // multiple uses, except on hardware with full-rate multiply-add (which is
13624 // part of full-rate 64-bit ops).
13625 if (!Subtarget->hasFullRate64Ops()) {
13626 unsigned NumUsers = 0;
13627 for (SDNode *Use : LHS->uses()) {
13628 // There is a use that does not feed into addition, so the multiply can't
13629 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13630 if (Use->getOpcode() != ISD::ADD)
13631 return SDValue();
13632
13633 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13634 // MUL + 3xADD + 3xADDC over 3xMAD.
13635 ++NumUsers;
13636 if (NumUsers >= 3)
13637 return SDValue();
13638 }
13639 }
13640
13641 SDValue MulLHS = LHS.getOperand(0);
13642 SDValue MulRHS = LHS.getOperand(1);
13643 SDValue AddRHS = RHS;
13644
13645 // Always check whether operands are small unsigned values, since that
13646 // knowledge is useful in more cases. Check for small signed values only if
13647 // doing so can unlock a shorter code sequence.
13648 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13649 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13650
13651 bool MulSignedLo = false;
13652 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13653 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13654 numBitsSigned(MulRHS, DAG) <= 32;
13655 }
13656
13657 // The operands and final result all have the same number of bits. If
13658 // operands need to be extended, they can be extended with garbage. The
13659 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13660 // truncated away in the end.
13661 if (VT != MVT::i64) {
13662 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13663 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13664 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13665 }
13666
13667 // The basic code generated is conceptually straightforward. Pseudo code:
13668 //
13669 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13670 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13671 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13672 //
13673 // The second and third lines are optional, depending on whether the factors
13674 // are {sign,zero}-extended or not.
13675 //
13676 // The actual DAG is noisier than the pseudo code, but only due to
13677 // instructions that disassemble values into low and high parts, and
13678 // assemble the final result.
13679 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13680
13681 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13682 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13683 SDValue Accum =
13684 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13685
13686 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13687 SDValue AccumLo, AccumHi;
13688 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13689
13690 if (!MulLHSUnsigned32) {
13691 auto MulLHSHi =
13692 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13693 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13694 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13695 }
13696
13697 if (!MulRHSUnsigned32) {
13698 auto MulRHSHi =
13699 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13700 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13701 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13702 }
13703
13704 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13705 Accum = DAG.getBitcast(MVT::i64, Accum);
13706 }
13707
13708 if (VT != MVT::i64)
13709 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13710 return Accum;
13711}
13712
13713// Collect the ultimate src of each of the mul node's operands, and confirm
13714// each operand is no wider than 8 bits (a single byte).
13715static std::optional<ByteProvider<SDValue>>
13716handleMulOperand(const SDValue &MulOperand) {
13717 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13718 if (!Byte0 || Byte0->isConstantZero()) {
13719 return std::nullopt;
13720 }
13721 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13722 if (Byte1 && !Byte1->isConstantZero()) {
13723 return std::nullopt;
13724 }
13725 return Byte0;
13726}
13727
13728static unsigned addPermMasks(unsigned First, unsigned Second) {
13729 unsigned FirstCs = First & 0x0c0c0c0c;
13730 unsigned SecondCs = Second & 0x0c0c0c0c;
13731 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13732 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13733
13734 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13735 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13736 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13737 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13738
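  // e.g. addPermMasks(0x0c0c0c00, 0x0c0c010c) == 0x0c0c0100: each byte keeps
  // the non-0x0c selector when exactly one side provides one, and stays 0x0c
  // (constant zero) only when both sides are 0x0c.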
13739 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13740}
13741
13742struct DotSrc {
13743 SDValue SrcOp;
13744 int64_t PermMask;
13745 int64_t DWordOffset;
13746};
13747
13748static void placeSources(ByteProvider<SDValue> &Src0,
13749 ByteProvider<SDValue> &Src1,
13750 SmallVectorImpl<DotSrc> &Src0s,
13751 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13752
13753 assert(Src0.Src.has_value() && Src1.Src.has_value());
13754 // Src0s and Src1s are empty, just place arbitrarily.
13755 if (Step == 0) {
13756 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13757 Src0.SrcOffset / 4});
13758 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13759 Src1.SrcOffset / 4});
13760 return;
13761 }
13762
13763 for (int BPI = 0; BPI < 2; BPI++) {
13764 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13765 if (BPI == 1) {
13766 BPP = {Src1, Src0};
13767 }
13768 unsigned ZeroMask = 0x0c0c0c0c;
13769 unsigned FMask = 0xFF << (8 * (3 - Step));
13770
13771 unsigned FirstMask =
13772 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13773 unsigned SecondMask =
13774 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13775 // Attempt to find the Src vector which contains our SDValue; if found, add
13776 // our perm mask to the existing one. If we are unable to find a match for
13777 // the first SDValue, attempt to find a match for the second.
13778 int FirstGroup = -1;
13779 for (int I = 0; I < 2; I++) {
13780 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13781 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13782 return IterElt.SrcOp == *BPP.first.Src &&
13783 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13784 };
13785
13786 auto Match = llvm::find_if(Srcs, MatchesFirst);
13787 if (Match != Srcs.end()) {
13788 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13789 FirstGroup = I;
13790 break;
13791 }
13792 }
13793 if (FirstGroup != -1) {
13794 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13795 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13796 return IterElt.SrcOp == *BPP.second.Src &&
13797 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13798 };
13799 auto Match = llvm::find_if(Srcs, MatchesSecond);
13800 if (Match != Srcs.end()) {
13801 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13802 } else
13803 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13804 return;
13805 }
13806 }
13807
13808 // If we have made it here, then we could not find a match in Src0s or Src1s
13809 // for either Src0 or Src1, so just place them arbitrarily.
13810
13811 unsigned ZeroMask = 0x0c0c0c0c;
13812 unsigned FMask = 0xFF << (8 * (3 - Step));
13813
13814 Src0s.push_back(
13815 {*Src0.Src,
13816 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13817 Src0.SrcOffset / 4});
13818 Src1s.push_back(
13819 {*Src1.Src,
13820 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13821 Src1.SrcOffset / 4});
13822
13823 return;
13824}
13825
13826static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13827 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13828 bool IsAny) {
13829
13830 // If we have just one source, permute it accordingly.
13831 if (Srcs.size() == 1) {
13832 auto Elt = Srcs.begin();
13833 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13834
13835 // v_perm will produce the original value
13836 if (Elt->PermMask == 0x3020100)
13837 return EltOp;
13838
13839 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13840 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13841 }
13842
13843 auto FirstElt = Srcs.begin();
13844 auto SecondElt = std::next(FirstElt);
13845
13846 SmallVector<SDValue, 2> Perms;
13847
13848 // If we have multiple sources in the chain, combine them via perms (using
13849 // calculated perm mask) and Ors.
13850 while (true) {
13851 auto FirstMask = FirstElt->PermMask;
13852 auto SecondMask = SecondElt->PermMask;
13853
13854 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13855 unsigned FirstPlusFour = FirstMask | 0x04040404;
13856 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
13857 // original 0x0C.
13858 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13859
13860 auto PermMask = addPermMasks(FirstMask, SecondMask);
13861 auto FirstVal =
13862 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13863 auto SecondVal =
13864 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13865
13866 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13867 SecondVal,
13868 DAG.getConstant(PermMask, SL, MVT::i32)));
13869
13870 FirstElt = std::next(SecondElt);
13871 if (FirstElt == Srcs.end())
13872 break;
13873
13874 SecondElt = std::next(FirstElt);
13875 // If we only have a FirstElt, then just combine that into the cumulative
13876 // source node.
13877 if (SecondElt == Srcs.end()) {
13878 auto EltOp =
13879 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13880
13881 Perms.push_back(
13882 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13883 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13884 break;
13885 }
13886 }
13887
13888 assert(Perms.size() == 1 || Perms.size() == 2);
13889 return Perms.size() == 2
13890 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13891 : Perms[0];
13892}
13893
13894static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
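  // e.g. with ChainLength == 2 a combined mask of 0x01000c0c becomes
  // 0x0c0c0100: the byte selectors shift down into the low bytes and the
  // unused high bytes become 0x0c (constant zero).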
13895 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13896 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13897 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13898 EntryMask += ZeroMask;
13899 }
13900}
13901
13902static bool isMul(const SDValue Op) {
13903 auto Opcode = Op.getOpcode();
13904
13905 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13906 Opcode == AMDGPUISD::MUL_I24);
13907}
13908
13909static std::optional<bool>
13910checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13911 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13912 const SDValue &S1Op, const SelectionDAG &DAG) {
13913 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13914 // of the dot4 is irrelevant.
13915 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13916 return false;
13917
13918 auto Known0 = DAG.computeKnownBits(S0Op, 0);
13919 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13920 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13921 auto Known1 = DAG.computeKnownBits(S1Op, 0);
13922 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13923 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13924
13925 assert(!(S0IsUnsigned && S0IsSigned));
13926 assert(!(S1IsUnsigned && S1IsSigned));
13927
13928 // There are 9 possible permutations of
13929 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13930
13931 // In two permutations, the sign bits are known to be the same for both Ops,
13932 // so simply return Signed / Unsigned corresponding to the MSB
13933
13934 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
13935 return S0IsSigned;
13936
13937 // In another two permutations, the sign bits are known to be opposite. In
13938 // this case return std::nullopt to indicate a bad match.
13939
13940 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
13941 return std::nullopt;
13942
13943 // In the remaining five permutations, we don't know the value of the sign
13944 // bit for at least one Op. Since we have a valid ByteProvider, we know that
13945 // the upper bits must be extension bits. Thus, the only ways for the sign
13946 // bit to be unknown are if it was sign extended from an unknown value, or
13947 // if it was any extended. In either case, it is correct to use the signed
13948 // version of the signedness semantics of dot4.
13949
13950 // In two of such permutations, we know the sign bit is set for
13951 // one op, and the other is unknown. It is okay to use the signed version of
13952 // dot4.
13953 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
13954 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
13955 return true;
13956
13957 // In one such permutation, we don't know either of the sign bits. It is okay
13958 // to use the signed version of dot4.
13959 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
13960 return true;
13961
13962 // In two of such permutations, we know the sign bit is unset for
13963 // one op, and the other is unknown. Return std::nullopt to indicate a
13964 // bad match.
13965 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
13966 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
13967 return std::nullopt;
13968
13969 llvm_unreachable("Fully covered condition");
13970}
13971
13972SDValue SITargetLowering::performAddCombine(SDNode *N,
13973 DAGCombinerInfo &DCI) const {
13974 SelectionDAG &DAG = DCI.DAG;
13975 EVT VT = N->getValueType(0);
13976 SDLoc SL(N);
13977 SDValue LHS = N->getOperand(0);
13978 SDValue RHS = N->getOperand(1);
13979
13980 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
13981 if (Subtarget->hasMad64_32()) {
13982 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
13983 return Folded;
13984 }
13985 }
13986
13987 if (SDValue V = reassociateScalarOps(N, DAG)) {
13988 return V;
13989 }
13990
13991 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
13992 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
13993 SDValue TempNode(N, 0);
13994 std::optional<bool> IsSigned;
13995 SmallVector<DotSrc, 4> Src0s;
13996 SmallVector<DotSrc, 4> Src1s;
13997 SmallVector<SDValue, 4> Src2s;
13998
13999 // Match the v_dot4 tree, while collecting src nodes.
14000 int ChainLength = 0;
14001 for (int I = 0; I < 4; I++) {
14002 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14003 if (MulIdx == -1)
14004 break;
14005 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14006 if (!Src0)
14007 break;
14008 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14009 if (!Src1)
14010 break;
14011
14012 auto IterIsSigned = checkDot4MulSignedness(
14013 TempNode->getOperand(MulIdx), *Src0, *Src1,
14014 TempNode->getOperand(MulIdx)->getOperand(0),
14015 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14016 if (!IterIsSigned)
14017 break;
14018 if (!IsSigned)
14019 IsSigned = *IterIsSigned;
14020 if (*IterIsSigned != *IsSigned)
14021 break;
14022 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14023 auto AddIdx = 1 - MulIdx;
14024 // Allow the special case where add (add (mul24, 0), mul24) became
14025 // add (mul24, mul24).
14026 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14027 Src2s.push_back(TempNode->getOperand(AddIdx));
14028 auto Src0 =
14029 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14030 if (!Src0)
14031 break;
14032 auto Src1 =
14033 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14034 if (!Src1)
14035 break;
14036 auto IterIsSigned = checkDot4MulSignedness(
14037 TempNode->getOperand(AddIdx), *Src0, *Src1,
14038 TempNode->getOperand(AddIdx)->getOperand(0),
14039 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14040 if (!IterIsSigned)
14041 break;
14042 assert(IsSigned);
14043 if (*IterIsSigned != *IsSigned)
14044 break;
14045 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14046 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14047 ChainLength = I + 2;
14048 break;
14049 }
14050
14051 TempNode = TempNode->getOperand(AddIdx);
14052 Src2s.push_back(TempNode);
14053 ChainLength = I + 1;
14054 if (TempNode->getNumOperands() < 2)
14055 break;
14056 LHS = TempNode->getOperand(0);
14057 RHS = TempNode->getOperand(1);
14058 }
14059
14060 if (ChainLength < 2)
14061 return SDValue();
14062
14063 // Masks were constructed with the assumption that we would find a chain of
14064 // length 4. If not, then we need to zero out the unused high bytes (via the
14065 // 0x0c perm selector) so they do not affect the dot calculation.
14066 if (ChainLength < 4) {
14067 fixMasks(Src0s, ChainLength);
14068 fixMasks(Src1s, ChainLength);
14069 }
14070
14071 SDValue Src0, Src1;
14072
14073 // If we are just using a single source for both, and have permuted the
14074 // bytes consistently, we can just use the sources without permuting
14075 // (commutation).
14076 bool UseOriginalSrc = false;
14077 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14078 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14079 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14080 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14081 SmallVector<unsigned, 4> SrcBytes;
14082 auto Src0Mask = Src0s.begin()->PermMask;
14083 SrcBytes.push_back(Src0Mask & 0xFF000000);
14084 bool UniqueEntries = true;
14085 for (auto I = 1; I < 4; I++) {
14086 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14087
14088 if (is_contained(SrcBytes, NextByte)) {
14089 UniqueEntries = false;
14090 break;
14091 }
14092 SrcBytes.push_back(NextByte);
14093 }
14094
14095 if (UniqueEntries) {
14096 UseOriginalSrc = true;
14097
14098 auto FirstElt = Src0s.begin();
14099 auto FirstEltOp =
14100 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14101
14102 auto SecondElt = Src1s.begin();
14103 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14104 SecondElt->DWordOffset);
14105
14106 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14107 MVT::getIntegerVT(32));
14108 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14109 MVT::getIntegerVT(32));
14110 }
14111 }
14112
14113 if (!UseOriginalSrc) {
14114 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14115 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14116 }
14117
14118 assert(IsSigned);
14119 SDValue Src2 =
14120 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14121
14122 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14123 : Intrinsic::amdgcn_udot4,
14124 SL, MVT::i64);
14125
14126 assert(!VT.isVector());
14127 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14128 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14129
14130 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14131 }
14132
14133 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14134 return SDValue();
14135
14136 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14137 // add x, sext (setcc) => usubo_carry x, 0, setcc
14138 unsigned Opc = LHS.getOpcode();
14139 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14140 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14141 std::swap(RHS, LHS);
14142
14143 Opc = RHS.getOpcode();
14144 switch (Opc) {
14145 default: break;
14146 case ISD::ZERO_EXTEND:
14147 case ISD::SIGN_EXTEND:
14148 case ISD::ANY_EXTEND: {
14149 auto Cond = RHS.getOperand(0);
14150 // If this won't be a real VOPC output, we would still need to insert an
14151 // extra instruction anyway.
14152 if (!isBoolSGPR(Cond))
14153 break;
14154 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14155 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14156 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14157 return DAG.getNode(Opc, SL, VTList, Args);
14158 }
14159 case ISD::UADDO_CARRY: {
14160 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14161 if (!isNullConstant(RHS.getOperand(1)))
14162 break;
14163 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14164 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14165 }
14166 }
14167 return SDValue();
14168}
14169
14170SDValue SITargetLowering::performSubCombine(SDNode *N,
14171 DAGCombinerInfo &DCI) const {
14172 SelectionDAG &DAG = DCI.DAG;
14173 EVT VT = N->getValueType(0);
14174
14175 if (VT != MVT::i32)
14176 return SDValue();
14177
14178 SDLoc SL(N);
14179 SDValue LHS = N->getOperand(0);
14180 SDValue RHS = N->getOperand(1);
14181
14182 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14183 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14184 unsigned Opc = RHS.getOpcode();
14185 switch (Opc) {
14186 default: break;
14187 case ISD::ZERO_EXTEND:
14188 case ISD::SIGN_EXTEND:
14189 case ISD::ANY_EXTEND: {
14190 auto Cond = RHS.getOperand(0);
14191 // If this won't be a real VOPC output, we would still need to insert an
14192 // extra instruction anyway.
14193 if (!isBoolSGPR(Cond))
14194 break;
14195 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14196 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14197 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14198 return DAG.getNode(Opc, SL, VTList, Args);
14199 }
14200 }
14201
14202 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14203 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14204 if (!isNullConstant(LHS.getOperand(1)))
14205 return SDValue();
14206 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14207 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14208 }
14209 return SDValue();
14210}
14211
14212SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14213 DAGCombinerInfo &DCI) const {
14214
14215 if (N->getValueType(0) != MVT::i32)
14216 return SDValue();
14217
14218 if (!isNullConstant(N->getOperand(1)))
14219 return SDValue();
14220
14221 SelectionDAG &DAG = DCI.DAG;
14222 SDValue LHS = N->getOperand(0);
14223
14224 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14225 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14226 unsigned LHSOpc = LHS.getOpcode();
14227 unsigned Opc = N->getOpcode();
14228 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14229 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14230 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14231 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14232 }
14233 return SDValue();
14234}
14235
14236SDValue SITargetLowering::performFAddCombine(SDNode *N,
14237 DAGCombinerInfo &DCI) const {
14238 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14239 return SDValue();
14240
14241 SelectionDAG &DAG = DCI.DAG;
14242 EVT VT = N->getValueType(0);
14243
14244 SDLoc SL(N);
14245 SDValue LHS = N->getOperand(0);
14246 SDValue RHS = N->getOperand(1);
14247
14248 // These should really be instruction patterns, but writing patterns with
14249 // source modifiers is a pain.
14250
14251 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14252 if (LHS.getOpcode() == ISD::FADD) {
14253 SDValue A = LHS.getOperand(0);
14254 if (A == LHS.getOperand(1)) {
14255 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14256 if (FusedOp != 0) {
14257 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14258 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14259 }
14260 }
14261 }
14262
14263 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14264 if (RHS.getOpcode() == ISD::FADD) {
14265 SDValue A = RHS.getOperand(0);
14266 if (A == RHS.getOperand(1)) {
14267 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14268 if (FusedOp != 0) {
14269 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14270 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14271 }
14272 }
14273 }
14274
14275 return SDValue();
14276}
14277
14278SDValue SITargetLowering::performFSubCombine(SDNode *N,
14279 DAGCombinerInfo &DCI) const {
14280 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14281 return SDValue();
14282
14283 SelectionDAG &DAG = DCI.DAG;
14284 SDLoc SL(N);
14285 EVT VT = N->getValueType(0);
14286 assert(!VT.isVector());
14287
14288 // Try to get the fneg to fold into the source modifier. This undoes generic
14289 // DAG combines and folds them into the mad.
14290 //
14291 // Only do this if we are not trying to support denormals. v_mad_f32 does
14292 // not support denormals ever.
14293 SDValue LHS = N->getOperand(0);
14294 SDValue RHS = N->getOperand(1);
14295 if (LHS.getOpcode() == ISD::FADD) {
14296 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14297 SDValue A = LHS.getOperand(0);
14298 if (A == LHS.getOperand(1)) {
14299 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14300 if (FusedOp != 0){
14301 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14302 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14303
14304 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14305 }
14306 }
14307 }
14308
14309 if (RHS.getOpcode() == ISD::FADD) {
14310 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14311
14312 SDValue A = RHS.getOperand(0);
14313 if (A == RHS.getOperand(1)) {
14314 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14315 if (FusedOp != 0){
14316 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14317 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14318 }
14319 }
14320 }
14321
14322 return SDValue();
14323}
14324
14325SDValue SITargetLowering::performFDivCombine(SDNode *N,
14326 DAGCombinerInfo &DCI) const {
14327 SelectionDAG &DAG = DCI.DAG;
14328 SDLoc SL(N);
14329 EVT VT = N->getValueType(0);
14330 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14331 return SDValue();
14332
14333 SDValue LHS = N->getOperand(0);
14334 SDValue RHS = N->getOperand(1);
14335
14336 SDNodeFlags Flags = N->getFlags();
14337 SDNodeFlags RHSFlags = RHS->getFlags();
14338 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14339 !RHS->hasOneUse())
14340 return SDValue();
14341
14342 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14343 bool IsNegative = false;
14344 if (CLHS->isExactlyValue(1.0) ||
14345 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14346 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14347 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14348 if (RHS.getOpcode() == ISD::FSQRT) {
14349 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14350 SDValue Rsq =
14351 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14352 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14353 }
14354 }
14355 }
14356
14357 return SDValue();
14358}
14359
14360SDValue SITargetLowering::performFMACombine(SDNode *N,
14361 DAGCombinerInfo &DCI) const {
14362 SelectionDAG &DAG = DCI.DAG;
14363 EVT VT = N->getValueType(0);
14364 SDLoc SL(N);
14365
14366 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14367 return SDValue();
14368
14369 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14370 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14371 SDValue Op1 = N->getOperand(0);
14372 SDValue Op2 = N->getOperand(1);
14373 SDValue FMA = N->getOperand(2);
14374
14375 if (FMA.getOpcode() != ISD::FMA ||
14376 Op1.getOpcode() != ISD::FP_EXTEND ||
14377 Op2.getOpcode() != ISD::FP_EXTEND)
14378 return SDValue();
14379
14380 // fdot2_f32_f16 always flushes fp32 denormal operands and output to zero,
14381 // regardless of the denorm mode setting. Therefore,
14382 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14383 const TargetOptions &Options = DAG.getTarget().Options;
14384 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14385 (N->getFlags().hasAllowContract() &&
14386 FMA->getFlags().hasAllowContract())) {
14387 Op1 = Op1.getOperand(0);
14388 Op2 = Op2.getOperand(0);
14389 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14390 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14391 return SDValue();
14392
14393 SDValue Vec1 = Op1.getOperand(0);
14394 SDValue Idx1 = Op1.getOperand(1);
14395 SDValue Vec2 = Op2.getOperand(0);
14396
14397 SDValue FMAOp1 = FMA.getOperand(0);
14398 SDValue FMAOp2 = FMA.getOperand(1);
14399 SDValue FMAAcc = FMA.getOperand(2);
14400
14401 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14402 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14403 return SDValue();
14404
14405 FMAOp1 = FMAOp1.getOperand(0);
14406 FMAOp2 = FMAOp2.getOperand(0);
14407 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14408 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14409 return SDValue();
14410
14411 SDValue Vec3 = FMAOp1.getOperand(0);
14412 SDValue Vec4 = FMAOp2.getOperand(0);
14413 SDValue Idx2 = FMAOp1.getOperand(1);
14414
14415 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14416 // Idx1 and Idx2 cannot be the same.
14417 Idx1 == Idx2)
14418 return SDValue();
14419
14420 if (Vec1 == Vec2 || Vec3 == Vec4)
14421 return SDValue();
14422
14423 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14424 return SDValue();
14425
14426 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14427 (Vec1 == Vec4 && Vec2 == Vec3)) {
14428 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14429 DAG.getTargetConstant(0, SL, MVT::i1));
14430 }
14431 }
14432 return SDValue();
14433}
14434
14435SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14436 DAGCombinerInfo &DCI) const {
14437 SelectionDAG &DAG = DCI.DAG;
14438 SDLoc SL(N);
14439
14440 SDValue LHS = N->getOperand(0);
14441 SDValue RHS = N->getOperand(1);
14442 EVT VT = LHS.getValueType();
14443 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14444
14445 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14446 if (!CRHS) {
14447 CRHS = dyn_cast<ConstantSDNode>(LHS);
14448 if (CRHS) {
14449 std::swap(LHS, RHS);
14450 CC = ISD::getSetCCSwappedOperands(CC);
14451 }
14452 }
14453
14454 if (CRHS) {
14455 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14456 isBoolSGPR(LHS.getOperand(0))) {
14457 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14458 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14459 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14460 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14461 if ((CRHS->isAllOnes() &&
14462 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14463 (CRHS->isZero() &&
14464 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14465 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14466 DAG.getConstant(-1, SL, MVT::i1));
14467 if ((CRHS->isAllOnes() &&
14468 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14469 (CRHS->isZero() &&
14470 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14471 return LHS.getOperand(0);
14472 }
14473
14474 const APInt &CRHSVal = CRHS->getAPIntValue();
14475 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14476 LHS.getOpcode() == ISD::SELECT &&
14477 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14478 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14479 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14480 isBoolSGPR(LHS.getOperand(0))) {
14481 // Given CT != FT:
14482 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14483 // setcc (select cc, CT, CF), CF, ne => cc
14484 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14485 // setcc (select cc, CT, CF), CT, eq => cc
14486 const APInt &CT = LHS.getConstantOperandAPInt(1);
14487 const APInt &CF = LHS.getConstantOperandAPInt(2);
14488
14489 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14490 (CT == CRHSVal && CC == ISD::SETNE))
14491 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14492 DAG.getConstant(-1, SL, MVT::i1));
14493 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14494 (CT == CRHSVal && CC == ISD::SETEQ))
14495 return LHS.getOperand(0);
14496 }
14497 }
14498
14499 if (VT != MVT::f32 && VT != MVT::f64 &&
14500 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14501 return SDValue();
14502
14503 // Match isinf/isfinite pattern
14504 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14505 // (fcmp one (fabs x), inf) -> (fp_class x,
14506 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14507 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14508 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14509 if (!CRHS)
14510 return SDValue();
14511
14512 const APFloat &APF = CRHS->getValueAPF();
14513 if (APF.isInfinity() && !APF.isNegative()) {
14514 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14515 SIInstrFlags::N_INFINITY;
14516 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14517 SIInstrFlags::P_ZERO |
14518 SIInstrFlags::N_NORMAL |
14519 SIInstrFlags::P_NORMAL |
14520 SIInstrFlags::N_SUBNORMAL |
14521 SIInstrFlags::P_SUBNORMAL;
14522 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14523 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14524 DAG.getConstant(Mask, SL, MVT::i32));
14525 }
14526 }
14527
14528 return SDValue();
14529}
14530
14531SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14532 DAGCombinerInfo &DCI) const {
14533 SelectionDAG &DAG = DCI.DAG;
14534 SDLoc SL(N);
14535 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
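  // The four CVT_F32_UBYTEn opcodes are consecutive, so Offset is the source
  // byte index (0-3) that this node converts.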
14536
14537 SDValue Src = N->getOperand(0);
14538 SDValue Shift = N->getOperand(0);
14539
14540 // TODO: Extend type shouldn't matter (assuming legal types).
14541 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14542 Shift = Shift.getOperand(0);
14543
14544 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14545 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14546 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14547 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14548 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14549 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14550 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14551 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14552 SDLoc(Shift.getOperand(0)), MVT::i32);
14553
14554 unsigned ShiftOffset = 8 * Offset;
14555 if (Shift.getOpcode() == ISD::SHL)
14556 ShiftOffset -= C->getZExtValue();
14557 else
14558 ShiftOffset += C->getZExtValue();
14559
14560 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14561 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14562 MVT::f32, Shifted);
14563 }
14564 }
14565 }
14566
14567 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14568 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14569 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14570 // We simplified Src. If this node is not dead, visit it again so it is
14571 // folded properly.
14572 if (N->getOpcode() != ISD::DELETED_NODE)
14573 DCI.AddToWorklist(N);
14574 return SDValue(N, 0);
14575 }
14576
14577 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14578 if (SDValue DemandedSrc =
14579 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14580 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14581
14582 return SDValue();
14583}
14584
14585SDValue SITargetLowering::performClampCombine(SDNode *N,
14586 DAGCombinerInfo &DCI) const {
14587 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14588 if (!CSrc)
14589 return SDValue();
14590
14591 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14592 const APFloat &F = CSrc->getValueAPF();
14593 APFloat Zero = APFloat::getZero(F.getSemantics());
14594 if (F < Zero ||
14595 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14596 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14597 }
14598
14599 APFloat One(F.getSemantics(), "1.0");
14600 if (F > One)
14601 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14602
14603 return SDValue(CSrc, 0);
14604}
14605
14606
14607SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14608 DAGCombinerInfo &DCI) const {
14609 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14610 return SDValue();
14611 switch (N->getOpcode()) {
14612 case ISD::ADD:
14613 return performAddCombine(N, DCI);
14614 case ISD::SUB:
14615 return performSubCombine(N, DCI);
14616 case ISD::UADDO_CARRY:
14617 case ISD::USUBO_CARRY:
14618 return performAddCarrySubCarryCombine(N, DCI);
14619 case ISD::FADD:
14620 return performFAddCombine(N, DCI);
14621 case ISD::FSUB:
14622 return performFSubCombine(N, DCI);
14623 case ISD::FDIV:
14624 return performFDivCombine(N, DCI);
14625 case ISD::SETCC:
14626 return performSetCCCombine(N, DCI);
14627 case ISD::FMAXNUM:
14628 case ISD::FMINNUM:
14629 case ISD::FMAXNUM_IEEE:
14630 case ISD::FMINNUM_IEEE:
14631 case ISD::FMAXIMUM:
14632 case ISD::FMINIMUM:
14633 case ISD::SMAX:
14634 case ISD::SMIN:
14635 case ISD::UMAX:
14636 case ISD::UMIN:
14637 case AMDGPUISD::FMIN_LEGACY:
14638 case AMDGPUISD::FMAX_LEGACY:
14639 return performMinMaxCombine(N, DCI);
14640 case ISD::FMA:
14641 return performFMACombine(N, DCI);
14642 case ISD::AND:
14643 return performAndCombine(N, DCI);
14644 case ISD::OR:
14645 return performOrCombine(N, DCI);
14646 case ISD::FSHR: {
14647 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14648 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14649 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14650 return matchPERM(N, DCI);
14651 }
14652 break;
14653 }
14654 case ISD::XOR:
14655 return performXorCombine(N, DCI);
14656 case ISD::ZERO_EXTEND:
14657 return performZeroExtendCombine(N, DCI);
14658 case ISD::SIGN_EXTEND_INREG:
14659 return performSignExtendInRegCombine(N, DCI);
14660 case AMDGPUISD::FP_CLASS:
14661 return performClassCombine(N, DCI);
14662 case ISD::FCANONICALIZE:
14663 return performFCanonicalizeCombine(N, DCI);
14664 case AMDGPUISD::RCP:
14665 return performRcpCombine(N, DCI);
14666 case ISD::FLDEXP:
14667 case AMDGPUISD::FRACT:
14668 case AMDGPUISD::RSQ:
14669 case AMDGPUISD::RCP_LEGACY:
14670 case AMDGPUISD::RCP_IFLAG:
14671 case AMDGPUISD::RSQ_CLAMP: {
14672 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14673 SDValue Src = N->getOperand(0);
14674 if (Src.isUndef())
14675 return Src;
14676 break;
14677 }
14678 case ISD::SINT_TO_FP:
14679 case ISD::UINT_TO_FP:
14680 return performUCharToFloatCombine(N, DCI);
14681 case ISD::FCOPYSIGN:
14682 return performFCopySignCombine(N, DCI);
14683 case AMDGPUISD::CVT_F32_UBYTE0:
14684 case AMDGPUISD::CVT_F32_UBYTE1:
14685 case AMDGPUISD::CVT_F32_UBYTE2:
14686 case AMDGPUISD::CVT_F32_UBYTE3:
14687 return performCvtF32UByteNCombine(N, DCI);
14688 case AMDGPUISD::FMED3:
14689 return performFMed3Combine(N, DCI);
14690 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14691 return performCvtPkRTZCombine(N, DCI);
14692 case AMDGPUISD::CLAMP:
14693 return performClampCombine(N, DCI);
14694 case ISD::SCALAR_TO_VECTOR: {
14695 SelectionDAG &DAG = DCI.DAG;
14696 EVT VT = N->getValueType(0);
14697
14698 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14699 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14700 SDLoc SL(N);
14701 SDValue Src = N->getOperand(0);
14702 EVT EltVT = Src.getValueType();
14703 if (EltVT != MVT::i16)
14704 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14705
14706 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14707 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14708 }
14709
14710 break;
14711 }
14712 case ISD::EXTRACT_VECTOR_ELT:
14713 return performExtractVectorEltCombine(N, DCI);
14714 case ISD::INSERT_VECTOR_ELT:
14715 return performInsertVectorEltCombine(N, DCI);
14716 case ISD::FP_ROUND:
14717 return performFPRoundCombine(N, DCI);
14718 case ISD::LOAD: {
14719 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14720 return Widened;
14721 [[fallthrough]];
14722 }
14723 default: {
14724 if (!DCI.isBeforeLegalize()) {
14725 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14726 return performMemSDNodeCombine(MemNode, DCI);
14727 }
14728
14729 break;
14730 }
14731 }
14732
14733 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14734}
14735
14736/// Helper function for adjustWritemask
14737static unsigned SubIdx2Lane(unsigned Idx) {
14738 switch (Idx) {
14739 default: return ~0u;
14740 case AMDGPU::sub0: return 0;
14741 case AMDGPU::sub1: return 1;
14742 case AMDGPU::sub2: return 2;
14743 case AMDGPU::sub3: return 3;
14744 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14745 }
14746}
14747
14748/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14749SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14750 SelectionDAG &DAG) const {
14751 unsigned Opcode = Node->getMachineOpcode();
14752
14753 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14754 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14755 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14756 return Node; // not implemented for D16
14757
14758 SDNode *Users[5] = { nullptr };
14759 unsigned Lane = 0;
14760 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14761 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14762 unsigned NewDmask = 0;
14763 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14764 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14765 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14766 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14767 ? true
14768 : false;
14769 unsigned TFCLane = 0;
14770 bool HasChain = Node->getNumValues() > 1;
14771
14772 if (OldDmask == 0) {
14773 // These are folded out, but on the off chance it happens, don't assert.
14774 return Node;
14775 }
14776
14777 unsigned OldBitsSet = llvm::popcount(OldDmask);
14778 // Work out which is the TFE/LWE lane if that is enabled.
14779 if (UsesTFC) {
14780 TFCLane = OldBitsSet;
14781 }
14782
14783 // Try to figure out the used register components
14784 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14785 I != E; ++I) {
14786
14787 // Don't look at users of the chain.
14788 if (I.getUse().getResNo() != 0)
14789 continue;
14790
14791 // Abort if we can't understand the usage
14792 if (!I->isMachineOpcode() ||
14793 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14794 return Node;
14795
14796 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14797 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14798 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14799 // set, etc.
14800 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14801 if (Lane == ~0u)
14802 return Node;
14803
14804 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14805 if (UsesTFC && Lane == TFCLane) {
14806 Users[Lane] = *I;
14807 } else {
14808 // Set which texture component corresponds to the lane.
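      // e.g. with OldDmask == 0b1010, Lane 0 maps to texture component 1 and
      // Lane 1 maps to component 3.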
14809 unsigned Comp;
14810 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14811 Comp = llvm::countr_zero(Dmask);
14812 Dmask &= ~(1 << Comp);
14813 }
14814
14815 // Abort if we have more than one user per component.
14816 if (Users[Lane])
14817 return Node;
14818
14819 Users[Lane] = *I;
14820 NewDmask |= 1 << Comp;
14821 }
14822 }
14823
14824 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14825 bool NoChannels = !NewDmask;
14826 if (NoChannels) {
14827 if (!UsesTFC) {
14828 // No uses of the result and not using TFC. Then do nothing.
14829 return Node;
14830 }
14831 // If the original dmask has only one channel, then there is nothing to do.
14832 if (OldBitsSet == 1)
14833 return Node;
14834 // Use an arbitrary dmask - required for the instruction to work
14835 NewDmask = 1;
14836 }
14837 // Abort if there's no change
14838 if (NewDmask == OldDmask)
14839 return Node;
14840
14841 unsigned BitsSet = llvm::popcount(NewDmask);
14842
14843 // Check for TFE or LWE - increase the number of channels by one to account
14844 // for the extra return value.
14845 // This will need adjustment for D16 if this is also included in
14846 // adjustWritemask (this function), but at present D16 is excluded.
14847 unsigned NewChannels = BitsSet + UsesTFC;
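  // e.g. NewDmask == 0b0101 with TFE enabled gives NewChannels == 3.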
14848
14849 int NewOpcode =
14850 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14851 assert(NewOpcode != -1 &&
14852 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14853 "failed to find equivalent MIMG op");
14854
14855 // Adjust the writemask in the node
14856 SmallVector<SDValue, 12> Ops;
14857 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14858 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14859 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14860
14861 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14862
14863 MVT ResultVT = NewChannels == 1 ?
14864 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14865 NewChannels == 5 ? 8 : NewChannels);
14866 SDVTList NewVTList = HasChain ?
14867 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14868
14869
14870 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14871 NewVTList, Ops);
14872
14873 if (HasChain) {
14874 // Update chain.
14875 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14876 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14877 }
14878
14879 if (NewChannels == 1) {
14880 assert(Node->hasNUsesOfValue(1, 0));
14881 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14882 SDLoc(Node), Users[Lane]->getValueType(0),
14883 SDValue(NewNode, 0));
14884 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14885 return nullptr;
14886 }
14887
14888 // Update the users of the node with the new indices
14889 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14890 SDNode *User = Users[i];
14891 if (!User) {
14892 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14893 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14894 if (i || !NoChannels)
14895 continue;
14896 } else {
14897 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14898 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14899 if (NewUser != User) {
14900 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14901 DAG.RemoveDeadNode(User);
14902 }
14903 }
14904
14905 switch (Idx) {
14906 default: break;
14907 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14908 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14909 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14910 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14911 }
14912 }
14913
14914 DAG.RemoveDeadNode(Node);
14915 return nullptr;
14916}
14917
14918static bool isFrameIndexOp(SDValue Op) {
14919 if (Op.getOpcode() == ISD::AssertZext)
14920 Op = Op.getOperand(0);
14921
14922 return isa<FrameIndexSDNode>(Op);
14923}
14924
14925/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14926/// with frame index operands.
14927/// LLVM assumes that inputs to these instructions are registers.
14928SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14929 SelectionDAG &DAG) const {
14930 if (Node->getOpcode() == ISD::CopyToReg) {
14931 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
14932 SDValue SrcVal = Node->getOperand(2);
14933
14934 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
14935 // to try understanding copies to physical registers.
14936 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
14937 SDLoc SL(Node);
14938 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
14939 SDValue VReg = DAG.getRegister(
14940 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
14941
14942 SDNode *Glued = Node->getGluedNode();
14943 SDValue ToVReg
14944 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
14945 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
14946 SDValue ToResultReg
14947 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
14948 VReg, ToVReg.getValue(1));
14949 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
14950 DAG.RemoveDeadNode(Node);
14951 return ToResultReg.getNode();
14952 }
14953 }
14954
14955 SmallVector<SDValue, 8> Ops;
14956 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
14957 if (!isFrameIndexOp(Node->getOperand(i))) {
14958 Ops.push_back(Node->getOperand(i));
14959 continue;
14960 }
14961
14962 SDLoc DL(Node);
14963 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
14964 Node->getOperand(i).getValueType(),
14965 Node->getOperand(i)), 0));
14966 }
14967
14968 return DAG.UpdateNodeOperands(Node, Ops);
14969}
14970
14971/// Fold the instructions after selecting them.
14972/// Returns null if users were already updated.
14973SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
14974 SelectionDAG &DAG) const {
14975 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14976 unsigned Opcode = Node->getMachineOpcode();
14977
14978 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
14979 !TII->isGather4(Opcode) &&
14980 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
14981 return adjustWritemask(Node, DAG);
14982 }
14983
14984 if (Opcode == AMDGPU::INSERT_SUBREG ||
14985 Opcode == AMDGPU::REG_SEQUENCE) {
14986 legalizeTargetIndependentNode(Node, DAG);
14987 return Node;
14988 }
14989
14990 switch (Opcode) {
14991 case AMDGPU::V_DIV_SCALE_F32_e64:
14992 case AMDGPU::V_DIV_SCALE_F64_e64: {
14993 // Satisfy the operand register constraint when one of the inputs is
14994 // undefined. Ordinarily each undef value will have its own implicit_def of
14995 // a vreg, so force these to use a single register.
14996 SDValue Src0 = Node->getOperand(1);
14997 SDValue Src1 = Node->getOperand(3);
14998 SDValue Src2 = Node->getOperand(5);
14999
15000 if ((Src0.isMachineOpcode() &&
15001 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15002 (Src0 == Src1 || Src0 == Src2))
15003 break;
15004
15005 MVT VT = Src0.getValueType().getSimpleVT();
15006 const TargetRegisterClass *RC =
15007 getRegClassFor(VT, Src0.getNode()->isDivergent());
15008
15010 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15011
15012 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15013 UndefReg, Src0, SDValue());
15014
15015 // src0 must be the same register as src1 or src2, even if the value is
15016 // undefined, so make sure we don't violate this constraint.
15017 if (Src0.isMachineOpcode() &&
15018 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15019 if (Src1.isMachineOpcode() &&
15020 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15021 Src0 = Src1;
15022 else if (Src2.isMachineOpcode() &&
15023 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15024 Src0 = Src2;
15025 else {
15026 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15027 Src0 = UndefReg;
15028 Src1 = UndefReg;
15029 }
15030 } else
15031 break;
15032
15033 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15034 Ops[1] = Src0;
15035 Ops[3] = Src1;
15036 Ops[5] = Src2;
15037 Ops.push_back(ImpDef.getValue(1));
15038 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15039 }
15040 default:
15041 break;
15042 }
15043
15044 return Node;
15045}
15046
15047// Any MIMG instructions that use tfe or lwe require an initialization of the
15048// result register that will be written in the case of a memory access failure.
15049// The required code is also added to tie this init code to the result of the
15050// img instruction.
15053 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15054 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15055 MachineBasicBlock &MBB = *MI.getParent();
15056
15057 int DstIdx =
15058 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15059 unsigned InitIdx = 0;
15060
15061 if (TII->isImage(MI)) {
15062 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15063 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15064 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15065
15066 if (!TFE && !LWE) // intersect_ray
15067 return;
15068
15069 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15070 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15071 unsigned D16Val = D16 ? D16->getImm() : 0;
15072
15073 if (!TFEVal && !LWEVal)
15074 return;
15075
15076    // At least one of TFE or LWE is non-zero.
15077 // We have to insert a suitable initialization of the result value and
15078 // tie this to the dest of the image instruction.
15079
15080 // Calculate which dword we have to initialize to 0.
15081 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15082
15083    // Check that the dmask operand was found.
15084 assert(MO_Dmask && "Expected dmask operand in instruction");
15085
15086 unsigned dmask = MO_Dmask->getImm();
15087 // Determine the number of active lanes taking into account the
15088 // Gather4 special case
15089 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15090
15091 bool Packed = !Subtarget->hasUnpackedD16VMem();
15092
15093 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15094
15095    // Abandon the attempt if the dst size isn't large enough. This is in
15096    // fact an error, but it is picked up elsewhere and reported
15097    // correctly.
15098 uint32_t DstSize =
15099 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15100 if (DstSize < InitIdx)
15101 return;
15102 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15103 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15104 } else {
15105 return;
15106 }
15107
15108 const DebugLoc &DL = MI.getDebugLoc();
15109
15110 // Create a register for the initialization value.
15111 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15112 unsigned NewDst = 0; // Final initialized value will be in here
15113
15114  // If the PRTStrictNull feature is enabled (the default), initialize all
15115  // the result registers to 0; otherwise initialize just the error
15116  // indication register (VGPRn+1).
15117 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15118 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15119
15120 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15121 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15122 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15123 // Initialize dword
15124 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15125 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15126 .addImm(0);
15127 // Insert into the super-reg
15128 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15129 .addReg(PrevDst)
15130 .addReg(SubReg)
15132
15133 PrevDst = NewDst;
15134 }
15135
15136 // Add as an implicit operand
15137 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15138
15139 // Tie the just added implicit operand to the dst
15140 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15141}
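The loop above zero-initializes InitIdx dwords, where InitIdx is derived from the dmask, the Gather4 special case, and packed D16 data. A minimal standalone sketch of that arithmetic (illustrative only, using std::popcount in place of llvm::popcount; not code from this file):

#include <bit>
#include <cassert>

// Dwords to pre-initialize when TFE/LWE is set: the active data dwords
// (halved, rounded up, when D16 data is packed) plus one dword for the
// TFE/LWE error code.
static unsigned tfeInitDwords(unsigned DMask, bool Gather4, bool D16,
                              bool Packed) {
  unsigned ActiveLanes = Gather4 ? 4 : std::popcount(DMask);
  return D16 && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
}

int main() {
  assert(tfeInitDwords(0x7, false, false, false) == 4); // 3 data dwords + 1
  assert(tfeInitDwords(0xF, false, true, true) == 3);   // 2 packed dwords + 1
}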
15142
15143/// Assign the register class depending on the number of
15144/// bits set in the writemask
15146 SDNode *Node) const {
15148
15149 MachineFunction *MF = MI.getParent()->getParent();
15152
15153 if (TII->isVOP3(MI.getOpcode())) {
15154 // Make sure constant bus requirements are respected.
15155 TII->legalizeOperandsVOP3(MRI, MI);
15156
15157 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15158    // This saves a chain-copy of registers and better balances register
15159    // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
15160 if (!MI.getDesc().operands().empty()) {
15161 unsigned Opc = MI.getOpcode();
15162 bool HasAGPRs = Info->mayNeedAGPRs();
15163 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15164 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15165 for (auto I :
15166 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15167 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15168 if (I == -1)
15169 break;
15170 if ((I == Src2Idx) && (HasAGPRs))
15171 break;
15172 MachineOperand &Op = MI.getOperand(I);
15173 if (!Op.isReg() || !Op.getReg().isVirtual())
15174 continue;
15175 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15176 if (!TRI->hasAGPRs(RC))
15177 continue;
15178 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15179 if (!Src || !Src->isCopy() ||
15180 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15181 continue;
15182 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15183 // All uses of agpr64 and agpr32 can also accept vgpr except for
15184 // v_accvgpr_read, but we do not produce agpr reads during selection,
15185 // so no use checks are needed.
15186 MRI.setRegClass(Op.getReg(), NewRC);
15187 }
15188
15189 if (!HasAGPRs)
15190 return;
15191
15192 // Resolve the rest of AV operands to AGPRs.
15193 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15194 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15195 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15196 if (TRI->isVectorSuperClass(RC)) {
15197 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15198 MRI.setRegClass(Src2->getReg(), NewRC);
15199 if (Src2->isTied())
15200 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15201 }
15202 }
15203 }
15204 }
15205
15206 return;
15207 }
15208
15209 if (TII->isImage(MI))
15210 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15211}
15212
15213static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15214                              uint64_t Val) {
15215 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15216 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15217}
15218
15220 const SDLoc &DL,
15221 SDValue Ptr) const {
15223
15224  // Build the half of the register that holds the constants before building
15225  // the full 128-bit register. If we are building multiple resource
15226  // descriptors, this will allow CSEing of the 2-component register.
15227 const SDValue Ops0[] = {
15228 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15229 buildSMovImm32(DAG, DL, 0),
15230 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15231 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15232 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15233 };
15234
15235 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15236 MVT::v2i32, Ops0), 0);
15237
15238 // Combine the constants and the pointer.
15239 const SDValue Ops1[] = {
15240 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15241 Ptr,
15242 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15243 SubRegHi,
15244 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15245 };
15246
15247 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15248}
15249
15250/// Return a resource descriptor with the 'Add TID' bit enabled
15251/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15252/// of the resource descriptor) to create an offset, which is added to
15253/// the resource pointer.
15255 SDValue Ptr, uint32_t RsrcDword1,
15256 uint64_t RsrcDword2And3) const {
15257 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15258 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15259 if (RsrcDword1) {
15260 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15261 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15262 0);
15263 }
15264
15265 SDValue DataLo = buildSMovImm32(DAG, DL,
15266 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15267 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15268
15269 const SDValue Ops[] = {
15270 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15271 PtrLo,
15272 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15273 PtrHi,
15274 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15275 DataLo,
15276 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15277 DataHi,
15278 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15279 };
15280
15281 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15282}
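For orientation, a hedged sketch of how a hypothetical caller could produce the arguments this helper consumes: bits [61:48] of the descriptor mentioned in the comment fall into bits [29:16] of dword1 (the half that gets OR'd into the pointer's high word), and the 64-bit RsrcDword2And3 value is split exactly as DataLo/DataHi above. The names and the 14-bit stride width are assumptions for illustration, not taken from this file.

#include <cstdint>

struct RsrcWords {
  uint32_t Dword1;          // OR'd into the high half of the base pointer.
  uint32_t Dword2, Dword3;  // The DataLo / DataHi halves built above.
};

static RsrcWords packRsrc(uint32_t Stride, uint64_t Dword2And3) {
  RsrcWords W;
  W.Dword1 = (Stride & 0x3FFFu) << 16;            // descriptor bits [61:48]
  W.Dword2 = uint32_t(Dword2And3 & 0xFFFFFFFFu);  // low 32 bits, as DataLo
  W.Dword3 = uint32_t(Dword2And3 >> 32);          // high 32 bits, as DataHi
  return W;
}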
15283
15284//===----------------------------------------------------------------------===//
15285// SI Inline Assembly Support
15286//===----------------------------------------------------------------------===//
15287
15288std::pair<unsigned, const TargetRegisterClass *>
15290 StringRef Constraint,
15291 MVT VT) const {
15292 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15293
15294 const TargetRegisterClass *RC = nullptr;
15295 if (Constraint.size() == 1) {
15296 const unsigned BitWidth = VT.getSizeInBits();
15297 switch (Constraint[0]) {
15298 default:
15299 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15300 case 's':
15301 case 'r':
15302 switch (BitWidth) {
15303 case 16:
15304 RC = &AMDGPU::SReg_32RegClass;
15305 break;
15306 case 64:
15307 RC = &AMDGPU::SGPR_64RegClass;
15308 break;
15309 default:
15310        RC = TRI->getSGPRClassForBitWidth(BitWidth);
15311        if (!RC)
15312 return std::pair(0U, nullptr);
15313 break;
15314 }
15315 break;
15316 case 'v':
15317 switch (BitWidth) {
15318 case 16:
15319 RC = &AMDGPU::VGPR_32RegClass;
15320 break;
15321 default:
15322 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15323 if (!RC)
15324 return std::pair(0U, nullptr);
15325 break;
15326 }
15327 break;
15328 case 'a':
15329 if (!Subtarget->hasMAIInsts())
15330 break;
15331 switch (BitWidth) {
15332 case 16:
15333 RC = &AMDGPU::AGPR_32RegClass;
15334 break;
15335 default:
15336 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15337 if (!RC)
15338 return std::pair(0U, nullptr);
15339 break;
15340 }
15341 break;
15342 }
15343 // We actually support i128, i16 and f16 as inline parameters
15344 // even if they are not reported as legal
15345 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15346 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15347 return std::pair(0U, RC);
15348 }
15349
15350 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15351 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15352 if (RegName.consume_front("v")) {
15353 RC = &AMDGPU::VGPR_32RegClass;
15354 } else if (RegName.consume_front("s")) {
15355 RC = &AMDGPU::SGPR_32RegClass;
15356 } else if (RegName.consume_front("a")) {
15357 RC = &AMDGPU::AGPR_32RegClass;
15358 }
15359
15360 if (RC) {
15361 uint32_t Idx;
15362 if (RegName.consume_front("[")) {
15363 uint32_t End;
15364 bool Failed = RegName.consumeInteger(10, Idx);
15365 Failed |= !RegName.consume_front(":");
15366 Failed |= RegName.consumeInteger(10, End);
15367 Failed |= !RegName.consume_back("]");
15368 if (!Failed) {
15369 uint32_t Width = (End - Idx + 1) * 32;
15370 MCRegister Reg = RC->getRegister(Idx);
15371          if (SIRegisterInfo::isVGPRClass(RC))
15372            RC = TRI->getVGPRClassForBitWidth(Width);
15373 else if (SIRegisterInfo::isSGPRClass(RC))
15374 RC = TRI->getSGPRClassForBitWidth(Width);
15375 else if (SIRegisterInfo::isAGPRClass(RC))
15376 RC = TRI->getAGPRClassForBitWidth(Width);
15377 if (RC) {
15378 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15379 return std::pair(Reg, RC);
15380 }
15381 }
15382 } else {
15383 bool Failed = RegName.getAsInteger(10, Idx);
15384 if (!Failed && Idx < RC->getNumRegs())
15385 return std::pair(RC->getRegister(Idx), RC);
15386 }
15387 }
15388 }
15389
15390 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15391 if (Ret.first)
15392 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15393
15394 return Ret;
15395}
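As a worked example of the register-range parsing above (a standalone sketch, not this file's code): the constraint "{v[8:11]}" yields Idx = 8 and End = 11, so Width = (11 - 8 + 1) * 32 = 128 and the returned register is the 128-bit VGPR tuple starting at v8.

#include <cassert>

// Width in bits implied by an inline-asm register range "[Idx:End]".
static unsigned rangeWidthInBits(unsigned Idx, unsigned End) {
  return (End - Idx + 1) * 32;
}

int main() {
  assert(rangeWidthInBits(8, 11) == 128); // "{v[8:11]}" -> 128-bit tuple
  assert(rangeWidthInBits(0, 1) == 64);   // "{s[0:1]}"  -> 64-bit pair
}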
15396
15397static bool isImmConstraint(StringRef Constraint) {
15398 if (Constraint.size() == 1) {
15399 switch (Constraint[0]) {
15400 default: break;
15401 case 'I':
15402 case 'J':
15403 case 'A':
15404 case 'B':
15405 case 'C':
15406 return true;
15407 }
15408 } else if (Constraint == "DA" ||
15409 Constraint == "DB") {
15410 return true;
15411 }
15412 return false;
15413}
15414
15417 if (Constraint.size() == 1) {
15418 switch (Constraint[0]) {
15419 default: break;
15420 case 's':
15421 case 'v':
15422 case 'a':
15423 return C_RegisterClass;
15424 }
15425 }
15426 if (isImmConstraint(Constraint)) {
15427 return C_Other;
15428 }
15429 return TargetLowering::getConstraintType(Constraint);
15430}
15431
15432static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15434 Val = Val & maskTrailingOnes<uint64_t>(Size);
15435 }
15436 return Val;
15437}
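A worked example of the masking above, assuming a 16-bit operand takes the masking path (illustrative; maskTrailingOnes<uint64_t>(16) equals 0xFFFF):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Val = 0xFFFFFFFFFFFFABCDULL;
  uint64_t Mask = (uint64_t(1) << 16) - 1; // maskTrailingOnes<uint64_t>(16)
  assert((Val & Mask) == 0xABCD);          // only the low 16 bits survive
}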
15438
15440 StringRef Constraint,
15441 std::vector<SDValue> &Ops,
15442 SelectionDAG &DAG) const {
15443 if (isImmConstraint(Constraint)) {
15444 uint64_t Val;
15445 if (getAsmOperandConstVal(Op, Val) &&
15446 checkAsmConstraintVal(Op, Constraint, Val)) {
15447 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15448 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15449 }
15450 } else {
15451 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15452 }
15453}
15454
15456 unsigned Size = Op.getScalarValueSizeInBits();
15457 if (Size > 64)
15458 return false;
15459
15460 if (Size == 16 && !Subtarget->has16BitInsts())
15461 return false;
15462
15463 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15464 Val = C->getSExtValue();
15465 return true;
15466 }
15467 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15468 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15469 return true;
15470 }
15471 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15472 if (Size != 16 || Op.getNumOperands() != 2)
15473 return false;
15474 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15475 return false;
15476 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15477 Val = C->getSExtValue();
15478 return true;
15479 }
15480 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15481 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15482 return true;
15483 }
15484 }
15485
15486 return false;
15487}
15488
15490 uint64_t Val) const {
15491 if (Constraint.size() == 1) {
15492 switch (Constraint[0]) {
15493 case 'I':
15495 case 'J':
15496 return isInt<16>(Val);
15497 case 'A':
15498 return checkAsmConstraintValA(Op, Val);
15499 case 'B':
15500 return isInt<32>(Val);
15501 case 'C':
15502 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15504 default:
15505 break;
15506 }
15507 } else if (Constraint.size() == 2) {
15508 if (Constraint == "DA") {
15509 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15510 int64_t LoBits = static_cast<int32_t>(Val);
15511 return checkAsmConstraintValA(Op, HiBits, 32) &&
15512 checkAsmConstraintValA(Op, LoBits, 32);
15513 }
15514 if (Constraint == "DB") {
15515 return true;
15516 }
15517 }
15518 llvm_unreachable("Invalid asm constraint");
15519}
15520
15522 unsigned MaxSize) const {
15523 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15524 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15525 if (Size == 16) {
15526 MVT VT = Op.getSimpleValueType();
15527 switch (VT.SimpleTy) {
15528 default:
15529 return false;
15530 case MVT::i16:
15531 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15532 case MVT::f16:
15533 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15534 case MVT::bf16:
15535 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15536 case MVT::v2i16:
15537 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15538 case MVT::v2f16:
15539 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15540 case MVT::v2bf16:
15541 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15542 }
15543 }
15544 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15545 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15546 return true;
15547 return false;
15548}
15549
15550static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15551 switch (UnalignedClassID) {
15552 case AMDGPU::VReg_64RegClassID:
15553 return AMDGPU::VReg_64_Align2RegClassID;
15554 case AMDGPU::VReg_96RegClassID:
15555 return AMDGPU::VReg_96_Align2RegClassID;
15556 case AMDGPU::VReg_128RegClassID:
15557 return AMDGPU::VReg_128_Align2RegClassID;
15558 case AMDGPU::VReg_160RegClassID:
15559 return AMDGPU::VReg_160_Align2RegClassID;
15560 case AMDGPU::VReg_192RegClassID:
15561 return AMDGPU::VReg_192_Align2RegClassID;
15562 case AMDGPU::VReg_224RegClassID:
15563 return AMDGPU::VReg_224_Align2RegClassID;
15564 case AMDGPU::VReg_256RegClassID:
15565 return AMDGPU::VReg_256_Align2RegClassID;
15566 case AMDGPU::VReg_288RegClassID:
15567 return AMDGPU::VReg_288_Align2RegClassID;
15568 case AMDGPU::VReg_320RegClassID:
15569 return AMDGPU::VReg_320_Align2RegClassID;
15570 case AMDGPU::VReg_352RegClassID:
15571 return AMDGPU::VReg_352_Align2RegClassID;
15572 case AMDGPU::VReg_384RegClassID:
15573 return AMDGPU::VReg_384_Align2RegClassID;
15574 case AMDGPU::VReg_512RegClassID:
15575 return AMDGPU::VReg_512_Align2RegClassID;
15576 case AMDGPU::VReg_1024RegClassID:
15577 return AMDGPU::VReg_1024_Align2RegClassID;
15578 case AMDGPU::AReg_64RegClassID:
15579 return AMDGPU::AReg_64_Align2RegClassID;
15580 case AMDGPU::AReg_96RegClassID:
15581 return AMDGPU::AReg_96_Align2RegClassID;
15582 case AMDGPU::AReg_128RegClassID:
15583 return AMDGPU::AReg_128_Align2RegClassID;
15584 case AMDGPU::AReg_160RegClassID:
15585 return AMDGPU::AReg_160_Align2RegClassID;
15586 case AMDGPU::AReg_192RegClassID:
15587 return AMDGPU::AReg_192_Align2RegClassID;
15588 case AMDGPU::AReg_256RegClassID:
15589 return AMDGPU::AReg_256_Align2RegClassID;
15590 case AMDGPU::AReg_512RegClassID:
15591 return AMDGPU::AReg_512_Align2RegClassID;
15592 case AMDGPU::AReg_1024RegClassID:
15593 return AMDGPU::AReg_1024_Align2RegClassID;
15594 default:
15595 return -1;
15596 }
15597}
15598
15599// Figure out which registers should be reserved for stack access. Only after
15600// the function is legalized do we know all of the non-spill stack objects or if
15601// calls are present.
15605 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15606 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15607 const SIInstrInfo *TII = ST.getInstrInfo();
15608
15609 if (Info->isEntryFunction()) {
15610 // Callable functions have fixed registers used for stack access.
15612 }
15613
15614 // TODO: Move this logic to getReservedRegs()
15615 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15616 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15617 Register SReg = ST.isWave32()
15618 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15619 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15620 &AMDGPU::SGPR_64RegClass);
15621 Info->setSGPRForEXECCopy(SReg);
15622
15623 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15624 Info->getStackPtrOffsetReg()));
15625 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15626 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15627
15628 // We need to worry about replacing the default register with itself in case
15629 // of MIR testcases missing the MFI.
15630 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15631 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15632
15633 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15634 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15635
15636 Info->limitOccupancy(MF);
15637
15638 if (ST.isWave32() && !MF.empty()) {
15639 for (auto &MBB : MF) {
15640 for (auto &MI : MBB) {
15641 TII->fixImplicitOperands(MI);
15642 }
15643 }
15644 }
15645
15646 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15647 // classes if required. Ideally the register class constraints would differ
15648 // per-subtarget, but there's no easy way to achieve that right now. This is
15649 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15650 // from using them as the register class for legal types.
15651 if (ST.needsAlignedVGPRs()) {
15652 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15653 const Register Reg = Register::index2VirtReg(I);
15654 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15655 if (!RC)
15656 continue;
15657 int NewClassID = getAlignedAGPRClassID(RC->getID());
15658 if (NewClassID != -1)
15659 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15660 }
15661 }
15662
15664}
15665
15667 KnownBits &Known,
15668 const APInt &DemandedElts,
15669 const SelectionDAG &DAG,
15670 unsigned Depth) const {
15671 Known.resetAll();
15672 unsigned Opc = Op.getOpcode();
15673 switch (Opc) {
15675 unsigned IID = Op.getConstantOperandVal(0);
15676 switch (IID) {
15677 case Intrinsic::amdgcn_mbcnt_lo:
15678 case Intrinsic::amdgcn_mbcnt_hi: {
15679 const GCNSubtarget &ST =
15681      // These return at most (wavefront size - 1) + src1.
15682      // As long as src1 is an immediate we can calculate the known bits.
15683 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15684 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15685 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15686 // Cater for potential carry
15687 MaxActiveBits += Src1ValBits ? 1 : 0;
15688 unsigned Size = Op.getValueType().getSizeInBits();
15689 if (MaxActiveBits < Size)
15690 Known.Zero.setHighBits(Size - MaxActiveBits);
15691 return;
15692 }
15693 }
15694 break;
15695 }
15696 }
15698 Op, Known, DemandedElts, DAG, Depth);
15699}
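A numeric sketch of the mbcnt bound used above (standalone, not code from this file): on a wave64 target the wavefront-size log2 is 6, so if src1 is known to use at most 4 bits, the result uses at most max(4, 6) + 1 = 7 bits and the top 25 bits of an i32 result are known zero.

#include <algorithm>
#include <cassert>

// Known-zero high bits for mbcnt_lo/hi, mirroring the bound above:
// result <= (wavefront size - 1) + src1, plus one bit for a potential carry.
static unsigned mbcntKnownZeroHighBits(unsigned Src1ValBits,
                                       unsigned WavefrontSizeLog2,
                                       unsigned ResultBits = 32) {
  unsigned MaxActiveBits = std::max(Src1ValBits, WavefrontSizeLog2);
  MaxActiveBits += Src1ValBits ? 1 : 0;
  return MaxActiveBits < ResultBits ? ResultBits - MaxActiveBits : 0;
}

int main() {
  assert(mbcntKnownZeroHighBits(4, 6) == 25); // wave64, 4-bit src1
  assert(mbcntKnownZeroHighBits(0, 5) == 27); // wave32, src1 known zero
}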
15700
15702 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15704
15705 // Set the high bits to zero based on the maximum allowed scratch size per
15706 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15707 // calculation won't overflow, so assume the sign bit is never set.
15708 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15709}
15710
15712 KnownBits &Known, unsigned Dim) {
15713 unsigned MaxValue =
15714 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15715 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15716}
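For example (illustrative only): if the maximum workitem ID in a dimension is 1023, it needs 10 bits, so countl_zero reports 22 known-zero high bits for the 32-bit ID.

#include <bit>
#include <cassert>

int main() {
  assert(std::countl_zero(1023u) == 22); // MaxValue = 1023 -> 22 zero high bits
}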
15717
15719 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15720 const MachineRegisterInfo &MRI, unsigned Depth) const {
15721 const MachineInstr *MI = MRI.getVRegDef(R);
15722 switch (MI->getOpcode()) {
15723 case AMDGPU::G_INTRINSIC:
15724 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15725 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15726 case Intrinsic::amdgcn_workitem_id_x:
15727 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15728 break;
15729 case Intrinsic::amdgcn_workitem_id_y:
15730 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15731 break;
15732 case Intrinsic::amdgcn_workitem_id_z:
15733 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15734 break;
15735 case Intrinsic::amdgcn_mbcnt_lo:
15736 case Intrinsic::amdgcn_mbcnt_hi: {
15737 // These return at most the wavefront size - 1.
15738 unsigned Size = MRI.getType(R).getSizeInBits();
15739 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15740 break;
15741 }
15742 case Intrinsic::amdgcn_groupstaticsize: {
15743 // We can report everything over the maximum size as 0. We can't report
15744 // based on the actual size because we don't know if it's accurate or not
15745 // at any given point.
15746 Known.Zero.setHighBits(
15747 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15748 break;
15749 }
15750 }
15751 break;
15752 }
15753 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15754 Known.Zero.setHighBits(24);
15755 break;
15756 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15757 Known.Zero.setHighBits(16);
15758 break;
15759 case AMDGPU::G_AMDGPU_SMED3:
15760 case AMDGPU::G_AMDGPU_UMED3: {
15761 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15762
15763 KnownBits Known2;
15764 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15765 if (Known2.isUnknown())
15766 break;
15767
15768 KnownBits Known1;
15769 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15770 if (Known1.isUnknown())
15771 break;
15772
15773 KnownBits Known0;
15774 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15775 if (Known0.isUnknown())
15776 break;
15777
15778 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15779 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15780 Known.One = Known0.One & Known1.One & Known2.One;
15781 break;
15782 }
15783 }
15784}
15785
15788 unsigned Depth) const {
15789 const MachineInstr *MI = MRI.getVRegDef(R);
15790 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15791 // FIXME: Can this move to generic code? What about the case where the call
15792 // site specifies a lower alignment?
15793 Intrinsic::ID IID = GI->getIntrinsicID();
15795 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15796 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15797 return *RetAlign;
15798 }
15799 return Align(1);
15800}
15801
15804 const Align CacheLineAlign = Align(64);
15805
15806  // Pre-GFX10 targets did not benefit from loop alignment.
15807 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15808 getSubtarget()->hasInstFwdPrefetchBug())
15809 return PrefAlign;
15810
15811  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
15812  // By default the prefetcher keeps one cache line behind and reads two ahead.
15813  // For larger loops we can switch it with S_INST_PREFETCH to keep two lines
15814  // behind and one ahead.
15815  // Therefore aligning loop headers pays off if the loop fits in 192 bytes.
15816  // If the loop fits in 64 bytes it always spans no more than two cache lines
15817  // and does not need alignment.
15818  // If the loop is at most 128 bytes we do not need to modify the prefetch,
15819  // and if it is at most 192 bytes we need two lines behind.
15820
15822 const MachineBasicBlock *Header = ML->getHeader();
15823 if (Header->getAlignment() != PrefAlign)
15824 return Header->getAlignment(); // Already processed.
15825
15826 unsigned LoopSize = 0;
15827 for (const MachineBasicBlock *MBB : ML->blocks()) {
15828    // If an inner loop block is aligned, assume that on average half of the
15829    // alignment size is added as nops.
15830 if (MBB != Header)
15831 LoopSize += MBB->getAlignment().value() / 2;
15832
15833 for (const MachineInstr &MI : *MBB) {
15834 LoopSize += TII->getInstSizeInBytes(MI);
15835 if (LoopSize > 192)
15836 return PrefAlign;
15837 }
15838 }
15839
15840 if (LoopSize <= 64)
15841 return PrefAlign;
15842
15843 if (LoopSize <= 128)
15844 return CacheLineAlign;
15845
15846  // If any of the parent loops is surrounded by prefetch instructions, do not
15847  // insert new ones for the inner loop, as that would reset the parent's settings.
15848 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15849 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15850 auto I = Exit->getFirstNonDebugInstr();
15851 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15852 return CacheLineAlign;
15853 }
15854 }
15855
15856 MachineBasicBlock *Pre = ML->getLoopPreheader();
15857 MachineBasicBlock *Exit = ML->getExitBlock();
15858
15859 if (Pre && Exit) {
15860 auto PreTerm = Pre->getFirstTerminator();
15861 if (PreTerm == Pre->begin() ||
15862 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15863 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15864 .addImm(1); // prefetch 2 lines behind PC
15865
15866 auto ExitHead = Exit->getFirstNonDebugInstr();
15867 if (ExitHead == Exit->end() ||
15868 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15869 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15870 .addImm(2); // prefetch 1 line behind PC
15871 }
15872
15873 return CacheLineAlign;
15874}
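A condensed sketch of the size thresholds applied above (standalone and illustrative; the enum names are invented here, and the actual prefetch-mode immediates are the ones passed to S_INST_PREFETCH above):

enum class LoopAlignAction {
  KeepDefault,       // <= 64 bytes: spans at most two cache lines, no alignment
  AlignOnly,         // <= 128 bytes: align to 64, keep the default prefetch mode
  AlignAndPrefetch,  // <= 192 bytes: align and switch to two lines behind PC
  TooBig             // > 192 bytes: alignment does not help
};

static LoopAlignAction classifyLoop(unsigned LoopSizeInBytes) {
  if (LoopSizeInBytes <= 64)
    return LoopAlignAction::KeepDefault;
  if (LoopSizeInBytes <= 128)
    return LoopAlignAction::AlignOnly;
  if (LoopSizeInBytes <= 192)
    return LoopAlignAction::AlignAndPrefetch;
  return LoopAlignAction::TooBig;
}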
15875
15877static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15878 assert(N->getOpcode() == ISD::CopyFromReg);
15879 do {
15880 // Follow the chain until we find an INLINEASM node.
15881 N = N->getOperand(0).getNode();
15882 if (N->getOpcode() == ISD::INLINEASM ||
15883 N->getOpcode() == ISD::INLINEASM_BR)
15884 return true;
15885 } while (N->getOpcode() == ISD::CopyFromReg);
15886 return false;
15887}
15888
15891 UniformityInfo *UA) const {
15892 switch (N->getOpcode()) {
15893 case ISD::CopyFromReg: {
15894 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15895 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15896 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15897 Register Reg = R->getReg();
15898
15899 // FIXME: Why does this need to consider isLiveIn?
15900 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15901 return !TRI->isSGPRReg(MRI, Reg);
15902
15903 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15904 return UA->isDivergent(V);
15905
15907 return !TRI->isSGPRReg(MRI, Reg);
15908 }
15909 case ISD::LOAD: {
15910 const LoadSDNode *L = cast<LoadSDNode>(N);
15911 unsigned AS = L->getAddressSpace();
15912 // A flat load may access private memory.
15914 }
15915 case ISD::CALLSEQ_END:
15916 return true;
15918 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
15920 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
15942 // Target-specific read-modify-write atomics are sources of divergence.
15943 return true;
15944 default:
15945 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
15946 // Generic read-modify-write atomics are sources of divergence.
15947 return A->readMem() && A->writeMem();
15948 }
15949 return false;
15950 }
15951}
15952
15954 EVT VT) const {
15955 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
15956 case MVT::f32:
15958 case MVT::f64:
15959 case MVT::f16:
15961 default:
15962 return false;
15963 }
15964}
15965
15967 LLT Ty, const MachineFunction &MF) const {
15968 switch (Ty.getScalarSizeInBits()) {
15969 case 32:
15970 return !denormalModeIsFlushAllF32(MF);
15971 case 64:
15972 case 16:
15973 return !denormalModeIsFlushAllF64F16(MF);
15974 default:
15975 return false;
15976 }
15977}
15978
15980 const SelectionDAG &DAG,
15981 bool SNaN,
15982 unsigned Depth) const {
15983 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
15984 const MachineFunction &MF = DAG.getMachineFunction();
15986
15987 if (Info->getMode().DX10Clamp)
15988 return true; // Clamped to 0.
15989 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
15990 }
15991
15993 SNaN, Depth);
15994}
15995
15996#if 0
15997// FIXME: This should be checked before unsafe fp atomics are enabled
15998// Global FP atomic instructions have a hardcoded FP mode and do not support
15999// FP32 denormals, and only support v2f16 denormals.
16000static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16002 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16003 if (&Flt == &APFloat::IEEEsingle())
16004 return DenormMode == DenormalMode::getPreserveSign();
16005 return DenormMode == DenormalMode::getIEEE();
16006}
16007#endif
16008
16009// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16010// floating point atomic instructions. May generate more efficient code,
16011// but may not respect rounding and denormal modes, and may give incorrect
16012// results for certain memory destinations.
16014 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16015 "true";
16016}
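For reference, a hedged sketch of how a frontend or pass might opt a function into this unsafe lowering through the C++ API (illustrative only; per the check above the attribute value must be exactly "true"):

#include "llvm/IR/Function.h"

// Mark a function as permitting unsafe FP atomic lowering.
static void allowUnsafeFPAtomics(llvm::Function &F) {
  F.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
}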
16017
16019 LLVMContext &Ctx = RMW->getContext();
16021 Ctx.getSyncScopeNames(SSNs);
16022 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16023 ? "system"
16024 : SSNs[RMW->getSyncScopeID()];
16025
16026 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16027 << "Hardware instruction generated for atomic "
16028 << RMW->getOperationName(RMW->getOperation())
16029 << " operation at memory scope " << MemScope;
16030}
16031
16034 unsigned AS = RMW->getPointerAddressSpace();
16035 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16037
16038 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16040 ORE.emit([=]() {
16041 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16042 });
16043 return Kind;
16044 };
16045
16046 auto SSID = RMW->getSyncScopeID();
16047 bool HasSystemScope =
16048 SSID == SyncScope::System ||
16049 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16050
16051 switch (RMW->getOperation()) {
16052 case AtomicRMWInst::Sub:
16053 case AtomicRMWInst::Or:
16054 case AtomicRMWInst::Xor: {
16055 // Atomic sub/or/xor do not work over PCI express, but atomic add
16056 // does. InstCombine transforms these with 0 to or, so undo that.
16057 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16058 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16059 ConstVal && ConstVal->isNullValue())
16061 }
16062
16063 break;
16064 }
16065 case AtomicRMWInst::FAdd: {
16066 Type *Ty = RMW->getType();
16067
16068 // TODO: Handle REGION_ADDRESS
16069 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16070 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16071 // is fixed to round-to-nearest-even.
16072 //
16073 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16074 // round-to-nearest-even.
16075 //
16076 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16077 // suggests it is OK if the floating-point mode may not match the calling
16078 // thread.
16079 if (Ty->isFloatTy()) {
16082 }
16083
16084 if (Ty->isDoubleTy()) {
16085 // Ignores denormal mode, but we don't consider flushing mandatory.
16088 }
16089
16090 // TODO: Handle v2f16/v2bf16 cases for gfx940
16092 }
16093
16097
16098 // TODO: gfx940 supports v2f16 and v2bf16
16099 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16101
16104
16105 // Always expand system scope fp atomics.
16106 if (HasSystemScope)
16108
16109 // global and flat atomic fadd f64: gfx90a, gfx940.
16110 if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
16111 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16112
16113 if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16114 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16115 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16116 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16117 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16118 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16119 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16120 }
16121
16122 // flat atomic fadd f32: gfx940, gfx11+.
16123 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16124 if (Subtarget->hasFlatAtomicFaddF32Inst())
16125 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16126
16127      // If it is in the flat address space and the type is float, we will try
16128      // to expand it if the target supports both global and LDS atomic fadd.
16129      // We need both because the expansion emits an address-space check: if the
16130      // address is in the global address space we emit the global atomic fadd;
16131      // if it is in the shared address space we emit the LDS atomic fadd.
16132 if (Subtarget->hasLDSFPAtomicAddF32()) {
16133 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16135 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16137 }
16138 }
16139
16141 }
16144 case AtomicRMWInst::Min:
16145 case AtomicRMWInst::Max:
16147 case AtomicRMWInst::UMax: {
16150 if (RMW->getType()->isFloatTy() &&
16153
16154 // Always expand system scope min/max atomics.
16155 if (HasSystemScope)
16157 }
16158 break;
16159 }
16160 default:
16161 break;
16162 }
16163
16165}
16166
16172}
16173
16176 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16179}
16180
16186}
16187
16188const TargetRegisterClass *
16189SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16191 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16192 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16193 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16194 : &AMDGPU::SReg_32RegClass;
16195 if (!TRI->isSGPRClass(RC) && !isDivergent)
16196 return TRI->getEquivalentSGPRClass(RC);
16197 else if (TRI->isSGPRClass(RC) && isDivergent)
16198 return TRI->getEquivalentVGPRClass(RC);
16199
16200 return RC;
16201}
16202
16203// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16204// uniform values (as produced by the mask results of control flow intrinsics)
16205// used outside of divergent blocks. The phi users need to also be treated as
16206// always uniform.
16207//
16208// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16209static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16210 unsigned WaveSize) {
16211 // FIXME: We assume we never cast the mask results of a control flow
16212 // intrinsic.
16213 // Early exit if the type won't be consistent as a compile time hack.
16214 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16215 if (!IT || IT->getBitWidth() != WaveSize)
16216 return false;
16217
16218 if (!isa<Instruction>(V))
16219 return false;
16220 if (!Visited.insert(V).second)
16221 return false;
16222 bool Result = false;
16223 for (const auto *U : V->users()) {
16224 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16225 if (V == U->getOperand(1)) {
16226 switch (Intrinsic->getIntrinsicID()) {
16227 default:
16228 Result = false;
16229 break;
16230 case Intrinsic::amdgcn_if_break:
16231 case Intrinsic::amdgcn_if:
16232 case Intrinsic::amdgcn_else:
16233 Result = true;
16234 break;
16235 }
16236 }
16237 if (V == U->getOperand(0)) {
16238 switch (Intrinsic->getIntrinsicID()) {
16239 default:
16240 Result = false;
16241 break;
16242 case Intrinsic::amdgcn_end_cf:
16243 case Intrinsic::amdgcn_loop:
16244 Result = true;
16245 break;
16246 }
16247 }
16248 } else {
16249 Result = hasCFUser(U, Visited, WaveSize);
16250 }
16251 if (Result)
16252 break;
16253 }
16254 return Result;
16255}
16256
16258 const Value *V) const {
16259 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16260 if (CI->isInlineAsm()) {
16261 // FIXME: This cannot give a correct answer. This should only trigger in
16262 // the case where inline asm returns mixed SGPR and VGPR results, used
16263 // outside the defining block. We don't have a specific result to
16264 // consider, so this assumes if any value is SGPR, the overall register
16265 // also needs to be SGPR.
16266 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16268 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16269 for (auto &TC : TargetConstraints) {
16270 if (TC.Type == InlineAsm::isOutput) {
16273 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16274 if (RC && SIRI->isSGPRClass(RC))
16275 return true;
16276 }
16277 }
16278 }
16279 }
16281 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16282}
16283
16285 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16286 for (; I != E; ++I) {
16287 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16288 if (getBasePtrIndex(M) == I.getOperandNo())
16289 return true;
16290 }
16291 }
16292 return false;
16293}
16294
16296 SDValue N1) const {
16297 if (!N0.hasOneUse())
16298 return false;
16299 // Take care of the opportunity to keep N0 uniform
16300 if (N0->isDivergent() || !N1->isDivergent())
16301 return true;
16302 // Check if we have a good chance to form the memory access pattern with the
16303 // base and offset
16304 return (DAG.isBaseWithConstantOffset(N0) &&
16305 hasMemSDNodeUser(*N0->use_begin()));
16306}
16307
16309 Register N0, Register N1) const {
16310 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16311}
16312
16315 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16317 if (I.getMetadata("amdgpu.noclobber"))
16318 Flags |= MONoClobber;
16319 if (I.getMetadata("amdgpu.last.use"))
16320 Flags |= MOLastUse;
16321 return Flags;
16322}
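A minimal sketch of the producer side (assumptions: a valid LoadInst is in hand; this is not code from this file): tagging a load with the metadata that the hook above translates into the MONoClobber / MOLastUse flags.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

static void markLoad(llvm::LoadInst &LI) {
  llvm::LLVMContext &Ctx = LI.getContext();
  LI.setMetadata("amdgpu.noclobber", llvm::MDNode::get(Ctx, {}));
  LI.setMetadata("amdgpu.last.use", llvm::MDNode::get(Ctx, {}));
}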
16323
16325 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16326 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16327 if (User->getOpcode() != ISD::CopyToReg)
16328 return false;
16329 if (!Def->isMachineOpcode())
16330 return false;
16331 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16332 if (!MDef)
16333 return false;
16334
16335 unsigned ResNo = User->getOperand(Op).getResNo();
16336 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16337 return false;
16338 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16339 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16340 PhysReg = AMDGPU::SCC;
16341 const TargetRegisterClass *RC =
16342 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16343 Cost = RC->getCopyCost();
16344 return true;
16345 }
16346 return false;
16347}
16348
16351
16354 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16355 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16356 "this cannot be replaced with add");
16358 return;
16359 }
16360
16361 assert(Subtarget->hasAtomicFaddInsts() &&
16362 "target should have atomic fadd instructions");
16363 assert(AI->getType()->isFloatTy() &&
16365 "generic atomicrmw expansion only supports FP32 operand in flat "
16366 "address space");
16367 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16368
16369 // Given: atomicrmw fadd ptr %addr, float %val ordering
16370 //
16371 // With this expansion we produce the following code:
16372 // [...]
16373 // br label %atomicrmw.check.shared
16374 //
16375 // atomicrmw.check.shared:
16376 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16377 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16378 //
16379 // atomicrmw.shared:
16380 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16381 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16382 // float %val ordering
16383 // br label %atomicrmw.phi
16384 //
16385 // atomicrmw.check.private:
16386 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16387 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16388 //
16389 // atomicrmw.private:
16390 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16391 // %loaded.private = load float, ptr addrspace(5) %cast.private
16392 // %val.new = fadd float %loaded.private, %val
16393 // store float %val.new, ptr addrspace(5) %cast.private
16394 // br label %atomicrmw.phi
16395 //
16396 // atomicrmw.global:
16397 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16398 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16399 // float %val ordering
16400 // br label %atomicrmw.phi
16401 //
16402 // atomicrmw.phi:
16403 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16404 // [ %loaded.private, %atomicrmw.private ],
16405 // [ %loaded.global, %atomicrmw.global ]
16406 // br label %atomicrmw.end
16407 //
16408 // atomicrmw.end:
16409 // [...]
16410
16411 IRBuilder<> Builder(AI);
16412 LLVMContext &Ctx = Builder.getContext();
16413
16414 BasicBlock *BB = Builder.GetInsertBlock();
16415 Function *F = BB->getParent();
16416 BasicBlock *ExitBB =
16417 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16418 BasicBlock *CheckSharedBB =
16419 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16420 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16421 BasicBlock *CheckPrivateBB =
16422 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16423 BasicBlock *PrivateBB =
16424 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16425 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16426 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16427
16428 Value *Val = AI->getValOperand();
16429 Type *ValTy = Val->getType();
16430 Value *Addr = AI->getPointerOperand();
16431
16432 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16433 Value *Val) -> Value * {
16434 AtomicRMWInst *OldVal =
16435 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16436 AI->getOrdering(), AI->getSyncScopeID());
16438 AI->getAllMetadata(MDs);
16439 for (auto &P : MDs)
16440 OldVal->setMetadata(P.first, P.second);
16441 return OldVal;
16442 };
16443
16444 std::prev(BB->end())->eraseFromParent();
16445 Builder.SetInsertPoint(BB);
16446 Builder.CreateBr(CheckSharedBB);
16447
16448 Builder.SetInsertPoint(CheckSharedBB);
16449 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16450 {Addr}, nullptr, "is.shared");
16451 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16452
16453 Builder.SetInsertPoint(SharedBB);
16454 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16456 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16457 Builder.CreateBr(PhiBB);
16458
16459 Builder.SetInsertPoint(CheckPrivateBB);
16460 CallInst *IsPrivate = Builder.CreateIntrinsic(
16461 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16462 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16463
16464 Builder.SetInsertPoint(PrivateBB);
16465 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16467 Value *LoadedPrivate =
16468 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16469 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16470 Builder.CreateStore(NewVal, CastToPrivate);
16471 Builder.CreateBr(PhiBB);
16472
16473 Builder.SetInsertPoint(GlobalBB);
16474 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16476 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16477 Builder.CreateBr(PhiBB);
16478
16479 Builder.SetInsertPoint(PhiBB);
16480 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16481 Loaded->addIncoming(LoadedShared, SharedBB);
16482 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16483 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16484 Builder.CreateBr(ExitBB);
16485
16486 AI->replaceAllUsesWith(Loaded);
16487 AI->eraseFromParent();
16488}
16489
16490LoadInst *
16492 IRBuilder<> Builder(AI);
16493 auto Order = AI->getOrdering();
16494
16495  // The optimization removes the store aspect of the atomicrmw. Therefore, the
16496  // cache must be flushed if the atomic ordering had release semantics. This
16497  // does not necessarily require a fence; a release fence just happens to do
16498  // that flush. Avoid replacing an atomicrmw that has release semantics.
16499 if (isReleaseOrStronger(Order))
16500 return nullptr;
16501
16502 LoadInst *LI = Builder.CreateAlignedLoad(
16503 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16504 LI->setAtomic(Order, AI->getSyncScopeID());
16505 LI->copyMetadata(*AI);
16506 LI->takeName(AI);
16507 AI->replaceAllUsesWith(LI);
16508 AI->eraseFromParent();
16509 return LI;
16510}
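The function above replaces an atomicrmw that does not change memory with an atomic load. A hedged sketch of the operand values for which common RMW operations are idempotent (the idempotence check itself is presumably done by the caller of this hook; the operation symbols here are just for illustration):

// Illustrative only: operand values for which common RMW operations leave
// memory unchanged, e.g. or/add/xor/sub with 0, or and with ~0.
static bool rmwIsIdempotent(char Op, long long Operand) {
  switch (Op) {
  case '|': case '+': case '^': case '-': return Operand == 0;
  case '&': return Operand == -1;
  default:  return false;
  }
}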
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:988
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
bool isNegative() const
Definition: APFloat.h:1295
APInt bitcastToAPInt() const
Definition: APFloat.h:1210
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
bool isInfinity() const
Definition: APFloat.h:1292
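The APFloat factories listed above feed the FP constant handling in this file. A minimal, self-contained sketch of how they compose; the helper name and chosen values are purely illustrative:

#include "llvm/ADT/APFloat.h"

// Build +inf in f32 and a quiet NaN in f16, then widen the NaN to f32.
static uint64_t apFloatSketch() {
  llvm::APFloat Inf = llvm::APFloat::getInf(llvm::APFloat::IEEEsingle());
  llvm::APFloat NaN = llvm::APFloat::getQNaN(llvm::APFloat::IEEEhalf());
  bool LosesInfo = false;
  NaN.convert(llvm::APFloat::IEEEsingle(),
              llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
  // bitcastToAPInt() exposes the raw encoding: 0x7F800000 for +inf in f32.
  return Inf.bitcastToAPInt().getZExtValue();
}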
Class for arbitrary precision integers.
Definition: APInt.h:76
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
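A short sketch of the APInt helpers referenced above; the widths and values are arbitrary and only meant to show the calls:

#include "llvm/ADT/APInt.h"

static bool apIntSketch() {
  llvm::APInt Mask =
      llvm::APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/16); // 0xFFFF0000
  unsigned TZ = Mask.countr_zero();   // 16 trailing zero bits
  llvm::APInt Known(32, 0);
  Known.setHighBits(8);               // 0xFF000000
  return Mask.uge(Known) && TZ == 16; // unsigned >= comparison holds here
}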
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:684
An instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
Value * getPointerOperand()
Definition: Instructions.h:910
void setOperation(BinOp Operation)
Definition: Instructions.h:861
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:918
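The atomic-expansion hooks further down query AtomicRMWInst through the accessors above. A hedged sketch of that kind of inspection; the helper and the printed fields are illustrative, not this file's actual policy:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/raw_ostream.h"

// Print the operation, address space, and ordering of an atomicrmw.
static void describeRMW(const llvm::AtomicRMWInst &RMW, llvm::raw_ostream &OS) {
  OS << llvm::AtomicRMWInst::getOperationName(RMW.getOperation())
     << " in addrspace(" << RMW.getPointerAddressSpace() << "), "
     << llvm::toIRString(RMW.getOrdering()) << "\n";
}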
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:199
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:570
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
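The SGPR/VGPR argument allocators declared earlier drive CCState and CCValAssign in the usual register-or-stack pattern. A minimal sketch under assumed 32-bit arguments; the helper name, register list, and sizes are placeholders:

#include "llvm/CodeGen/CallingConvLower.h"

static llvm::CCValAssign allocate32BitArg(unsigned ValNo, llvm::MVT VT,
                                          llvm::ArrayRef<llvm::MCPhysReg> Regs,
                                          llvm::CCState &State) {
  // Prefer the first free register from Regs; otherwise fall back to the stack.
  if (llvm::MCRegister Reg = State.AllocateReg(Regs))
    return llvm::CCValAssign::getReg(ValNo, VT, Reg, VT,
                                     llvm::CCValAssign::Full);
  int64_t Offset = State.AllocateStack(4, llvm::Align(4));
  return llvm::CCValAssign::getMem(ValNo, VT, Offset, VT,
                                   llvm::CCValAssign::Full);
}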
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1828
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
bool isSigned() const
Definition: InstrTypes.h:1265
bool isFPPredicate() const
Definition: InstrTypes.h:1122
bool isIntPredicate() const
Definition: InstrTypes.h:1123
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:41
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
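A tiny sketch of the two DataLayout queries above; the helper name is invented:

#include <utility>
#include "llvm/IR/DataLayout.h"

// Allocation size (including tail padding) and ABI alignment for an IR type.
static std::pair<uint64_t, llvm::Align> sizeAndAlign(const llvm::DataLayout &DL,
                                                     llvm::Type *Ty) {
  return {DL.getTypeAllocSize(Ty).getFixedValue(), DL.getABITypeAlign(Ty)};
}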
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:201
iterator_range< arg_iterator > args()
Definition: Function.h:838
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:263
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:738
bool hasPrefetch() const
Definition: GCNSubtarget.h:891
bool hasD16Images() const
Definition: GCNSubtarget.h:686
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:464
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:455
bool hasDot7Insts() const
Definition: GCNSubtarget.h:785
bool hasApertureRegs() const
Definition: GCNSubtarget.h:584
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:614
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:755
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:398
bool hasMAIInsts() const
Definition: GCNSubtarget.h:805
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:666
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:514
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:572
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:253
bool hasDot1Insts() const
Definition: GCNSubtarget.h:761
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:827
Align getStackAlignment() const
Definition: GCNSubtarget.h:904
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:442
bool enableFlatScratch() const
Definition: GCNSubtarget.h:639
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:610
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:448
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:843
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:265
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:731
bool useDS128() const
Definition: GCNSubtarget.h:524
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:444
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:257
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:576
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:414
bool hasIntClamp() const
Definition: GCNSubtarget.h:344
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:995
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:364
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:588
bool hasLDSFPAtomicAddF64() const
Definition: GCNSubtarget.h:965
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:618
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:917
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:720
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:323
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:871
bool hasFFBL() const
Definition: GCNSubtarget.h:402
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:936
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:546
bool hasMed3_16() const
Definition: GCNSubtarget.h:410
bool hasMovrel() const
Definition: GCNSubtarget.h:940
bool hasBFI() const
Definition: GCNSubtarget.h:390
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:564
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:331
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:509
bool hasFFBH() const
Definition: GCNSubtarget.h:406
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:823
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:829
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:954
bool hasLDSFPAtomicAddF32() const
Definition: GCNSubtarget.h:964
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:534
bool hasDot8Insts() const
Definition: GCNSubtarget.h:789
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:529
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:518
Generation getGeneration() const
Definition: GCNSubtarget.h:304
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:718
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:722
bool hasAddr64() const
Definition: GCNSubtarget.h:368
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:418
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:714
bool hasFractBug() const
Definition: GCNSubtarget.h:382
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:386
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:701
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
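Most of the subtarget predicates above gate individual lowering decisions. A hedged sketch of the usual access pattern; the predicate combination is illustrative only, not a condition taken from this file:

#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"

static bool preferPackedF32Math(const llvm::MachineFunction &MF) {
  const llvm::GCNSubtarget &ST = MF.getSubtarget<llvm::GCNSubtarget>();
  return ST.hasVOP3PInsts() && ST.hasPackedFP32Ops();
}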
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1533
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1120
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1790
LLVMContext & getContext() const
Definition: IRBuilder.h:176
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1803
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1854
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1114
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2132
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
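The IR-level atomic expansion in this file is assembled from IRBuilder calls like the ones above. Two hedged, self-contained fragments; the helper names are invented, and address space 0 is assumed to be AMDGPU's flat address space:

#include "llvm/IR/IRBuilder.h"

// Emit `atomicrmw fadd ptr, val seq_cst` with an explicit alignment.
static llvm::Value *emitFAddRMW(llvm::IRBuilder<> &B, llvm::Value *Ptr,
                                llvm::Value *Val, llvm::Align A) {
  return B.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, Ptr, Val, A,
                           llvm::AtomicOrdering::SequentiallyConsistent);
}

// Cast a pointer to the flat address space and load through it.
static llvm::Value *loadThroughFlat(llvm::IRBuilder<> &B, llvm::Value *Ptr,
                                    llvm::Type *Ty, llvm::Align A) {
  llvm::Value *Flat = B.CreateAddrSpaceCast(
      Ptr, llvm::PointerType::get(Ty, /*AddressSpace=*/0));
  return B.CreateAlignedLoad(Ty, Flat, A, "flat.load");
}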
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:341
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1636
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:377
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
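Two one-line sketches of the LLT helpers above; the helper names are invented, the include path assumes a recent LLVM tree, and address space 3 is the 32-bit LDS space on AMDGPU:

#include "llvm/CodeGenTypes/LowLevelType.h"

static llvm::LLT ldsPtrTy() { return llvm::LLT::pointer(3, 32); }

// For a scalar, or for the elements of a vector, halve the bit width.
static llvm::LLT halveScalar(llvm::LLT Ty) {
  return Ty.changeElementSize(Ty.getScalarSizeInBits() / 2);
}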
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
bool isCompare() const
Return true if this instruction is a comparison.
Definition: MCInstrDesc.h:341
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Definition: MCInstrDesc.cpp:32
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
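A sketch of the MVT manipulation these members support; hypothetical helper, integer element types only, include path assumes a recent LLVM tree:

#include "llvm/CodeGenTypes/MachineValueType.h"

// Round integer elements up to 32 bits, preserving vector-ness.
static llvm::MVT widenEltToI32(llvm::MVT VT) {
  if (VT.getScalarSizeInBits() >= 32)
    return VT;
  llvm::MVT NewElt = llvm::MVT::getIntegerVT(32);
  return VT.isVector()
             ? llvm::MVT::getVectorVT(NewElt, VT.getVectorNumElements())
             : NewElt;
}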
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
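The custom inserters in this file (splitBlockForLoop, loadM0FromVGPR, emitIndirectDst, ...) are built from the MachineBasicBlock, MachineFunction, and MachineInstrBuilder calls above. A simplified, hedged sketch of the block-splitting step only; the helper name is invented and live-in/exec bookkeeping is omitted:

#include <iterator>
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"

// Split MBB after MI: everything below MI moves to a new fall-through block.
static llvm::MachineBasicBlock *splitAfter(llvm::MachineInstr &MI,
                                           llvm::MachineBasicBlock &MBB) {
  llvm::MachineFunction *MF = MBB.getParent();
  llvm::MachineBasicBlock *Tail =
      MF->CreateMachineBasicBlock(MBB.getBasicBlock());
  MF->insert(std::next(MBB.getIterator()), Tail);

  Tail->splice(Tail->begin(), &MBB,
               std::next(llvm::MachineBasicBlock::iterator(MI)), MBB.end());
  Tail->transferSuccessorsAndUpdatePHIs(&MBB);
  MBB.addSuccessor(Tail);
  // Instructions are then emitted into either block with BuildMI(...), e.g.
  //   BuildMI(MBB, MBB.end(), DL, TII->get(Opc), Dst).addReg(Src).addImm(0);
  return Tail;
}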
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
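Intrinsic and buffer lowering attaches MachineMemOperands built from the pieces above. A minimal sketch; the helper name and flag choice are illustrative:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachineMemOperand *
buildLoadMMO(llvm::MachineFunction &MF, const llvm::MachinePointerInfo &PtrInfo,
             llvm::LLT MemTy, llvm::Align A) {
  auto F = llvm::MachineMemOperand::MOLoad |
           llvm::MachineMemOperand::MODereferenceable;
  return MF.getMachineMemOperand(PtrInfo, F, MemTy, A);
}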
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
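A small sketch of the kind of predicate the load/store lowering builds from these MemSDNode accessors; the helper is hypothetical:

#include "llvm/CodeGen/SelectionDAGNodes.h"

// True if the node's known alignment covers the in-memory size of its VT.
static bool isNaturallyAlignedMemOp(const llvm::MemSDNode *M) {
  return M->getAlign().value() >=
         M->getMemoryVT().getStoreSize().getFixedValue();
}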
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
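The DAG combines in this file walk SDNode use lists with the iterators above. A hedged sketch of that traversal shape; the name is invented (compare the findUser helper declared earlier):

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Return a user of Value whose opcode is Opcode, or null if there is none.
static llvm::SDNode *findUseWithOpcode(llvm::SDValue Value, unsigned Opcode) {
  for (llvm::SDNode::use_iterator I = Value->use_begin(),
                                  E = llvm::SDNode::use_end();
       I != E; ++I) {
    if (I.getUse().get() != Value)
      continue; // use of a different result of the same node
    if (I->getOpcode() == Opcode)
      return *I;
  }
  return nullptr;
}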
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, ...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
Definition: SelectionDAG.h:470
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
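As an illustrative sketch (not code from this file), a lowering or combine hook might use getNOT like this; the helper name buildNot and the assumption that DAG, DL and V are already in scope are hypothetical.

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical helper: build NOT(V), which getNOT materializes as XOR(V, all-ones).
static llvm::SDValue buildNot(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                              llvm::SDValue V) {
  return DAG.getNOT(DL, V, V.getValueType());
}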
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
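A minimal sketch of getConstant together with getNode (illustrative only; the helper name addImm and the surrounding DAG/DL/X values are assumptions):

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical helper: compute X + Imm, materializing the constant in X's own type.
static llvm::SDValue addImm(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                            llvm::SDValue X, uint64_t Imm) {
  llvm::SDValue C = DAG.getConstant(Imm, DL, X.getValueType());
  return DAG.getNode(llvm::ISD::ADD, DL, X.getValueType(), X, C);
}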
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
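A hedged sketch combining getStore and getLoad (documented above); the helper name, the 4-byte alignment and the assumption that Chain/Ptr/Val/PtrInfo come from surrounding lowering code are all illustrative.

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical helper: store Val through Ptr, then reload it, chaining the load
// on the store so the two memory operations stay ordered.
static llvm::SDValue storeThenLoad(llvm::SelectionDAG &DAG,
                                   const llvm::SDLoc &DL, llvm::SDValue Chain,
                                   llvm::SDValue Ptr, llvm::SDValue Val,
                                   llvm::MachinePointerInfo PtrInfo) {
  llvm::SDValue St = DAG.getStore(Chain, DL, Val, Ptr, PtrInfo, llvm::Align(4));
  return DAG.getLoad(Val.getValueType(), DL, St, Ptr, PtrInfo,
                     llvm::MaybeAlign(4));
}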
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
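A small illustrative sketch of getSelectCC (the helper name selectSMin is hypothetical, and the operands are assumed to be integer SDValues from surrounding code):

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical helper: signed minimum via select_cc, i.e. (LHS < RHS) ? LHS : RHS.
static llvm::SDValue selectSMin(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                llvm::SDValue LHS, llvm::SDValue RHS) {
  return DAG.getSelectCC(DL, LHS, RHS, LHS, RHS, llvm::ISD::SETLT);
}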
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
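An illustrative sketch of getSplatBuildVector (the helper name splat4 and the element count of 4 are arbitrary choices for the example):

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/ValueTypes.h"

// Hypothetical helper: splat a scalar SDValue into a 4-element vector of the
// same element type.
static llvm::SDValue splat4(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                            llvm::SDValue Scalar) {
  llvm::EVT VecVT = llvm::EVT::getVectorVT(*DAG.getContext(),
                                           Scalar.getValueType(), 4);
  return DAG.getSplatBuildVector(VecVT, DL, Scalar);
}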
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
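A minimal sketch of getZExtOrTrunc (the helper name toI32 and the target type i32 are illustrative; getSExtOrTrunc and getAnyExtOrTrunc are used the same way):

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical helper: coerce an integer SDValue to i32, zero-extending or
// truncating as needed.
static llvm::SDValue toI32(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                           llvm::SDValue V) {
  return DAG.getZExtOrTrunc(V, DL, llvm::MVT::i32);
}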
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
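An illustrative use of SmallPtrSet::insert (the element type const int* and the helper name are arbitrary for the example):

#include "llvm/ADT/SmallPtrSet.h"

// insert() returns a (iterator, bool) pair; the bool is true only when the
// pointer was not already present, which makes visited-set loops easy to write.
static bool markVisited(llvm::SmallPtrSet<const int *, 8> &Visited,
                        const int *P) {
  return Visited.insert(P).second; // false -> already seen
}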
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
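A self-contained illustration of the SmallVector operations listed above (element type and values are arbitrary):

#include "llvm/ADT/SmallVector.h"
#include <iterator>

// push_back adds one element, append adds a range, resize grows with
// value-initialized elements; the first 8 elements live inline, with no heap
// allocation.
static llvm::SmallVector<int, 8> buildSmallVector() {
  llvm::SmallVector<int, 8> V;
  V.push_back(1);
  int More[] = {2, 3, 4};
  V.append(std::begin(More), std::end(More));
  V.resize(6);
  return V;
}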
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:839
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:266
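An illustrative use of the StringRef queries above; the intrinsic-style name pattern is made up for the example.

#include "llvm/ADT/StringRef.h"

// starts_with/ends_with do not allocate; a StringRef is just a pointer plus a size.
static bool isWorkitemIdXName(llvm::StringRef Name) {
  return Name.starts_with("llvm.amdgcn.workitem.id") && Name.ends_with(".x");
}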
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
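An illustrative StringSwitch chain; the strings and values below are made up for the example, with 0 meaning "unrecognized".

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// Map a feature-like string to a number without a chain of if/else comparisons.
static unsigned parseWaveSize(llvm::StringRef S) {
  return llvm::StringSwitch<unsigned>(S)
      .Case("wavefrontsize32", 32)
      .Case("wavefrontsize64", 64)
      .Default(0);
}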
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction in which the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:381
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
constexpr bool isZero() const
Definition: TypeSize.h:156
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:271
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:236
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1128
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:723
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1004
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:250
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:714
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1277
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1247
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1278
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:985
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1037
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:783
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1260
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:790
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:913
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:480
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1273
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:903
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1274
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:939
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1406
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1280
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:620
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1194
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1053
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:722
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1227
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:994
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:930
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1083
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1276
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:507
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:727
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1243
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:208
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:880
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:651
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1022
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:999
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:600
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1271
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:984
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:780
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1217
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:742
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1254
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1279
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:971
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1047
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:798
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:888
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1103
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:922
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1285
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1269
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:465
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:990
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1270
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:836
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1188
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:470
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1214
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1268
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:944
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:869
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:414
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:907
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1100
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:786
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1076
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1284
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:340
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1529
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1509
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1023
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1531
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:228
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
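A self-contained illustration of the integer helpers documented nearby (popcount, divideCeil, maxIntN); the values are arbitrary.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void integerHelperExamples() {
  assert(llvm::popcount(0xF0u) == 4);    // four bits set in 0xF0
  assert(llvm::divideCeil(10, 4) == 3);  // ceil(10 / 4)
  assert(llvm::maxIntN(8) == 127);       // largest signed 8-bit value
}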
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
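A self-contained illustration of the power-of-two and bit-counting helpers above; the values are arbitrary.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void powerOfTwoExamples() {
  assert(llvm::isPowerOf2_32(64));        // 64 is a power of two
  assert(llvm::Log2_32(64) == 6);         // floor(log2(64))
  assert(llvm::PowerOf2Ceil(100) == 128); // round up to the next power of two
  assert(llvm::countr_zero(40u) == 3);    // 40 == 0b101000 -> 3 trailing zeros
}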
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
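A self-contained illustration of splitting a 64-bit value with Hi_32 and Lo_32 (the value is arbitrary).

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void hiLoExample() {
  const uint64_t V = 0x1122334455667788ULL;
  assert(llvm::Hi_32(V) == 0x11223344u); // upper 32 bits
  assert(llvm::Lo_32(V) == 0x55667788u); // lower 32 bits
}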
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:219
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t that is less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:439
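A self-contained illustration of the alignment helpers above (alignTo, alignDown, commonAlignment); the sizes and offsets are arbitrary.

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void alignmentExamples() {
  const llvm::Align A(16);
  assert(llvm::alignTo(10, A) == 16);               // round a size up to 16
  assert(llvm::alignDown(37, 16) == 32);            // round a value down to 16
  assert(llvm::commonAlignment(A, 8).value() == 8); // alignment known at offset 8
}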
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector of NumElements elements, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
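A self-contained illustration of the EVT queries above on a v4f32 type (the type choice is arbitrary for the example).

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

static void evtExamples() {
  llvm::LLVMContext Ctx;
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.isFloatingPoint());
  assert(V4F32.getVectorNumElements() == 4);
  assert(V4F32.getSizeInBits().getFixedValue() == 128);
  assert(V4F32.getScalarType() == llvm::EVT(llvm::MVT::f32));
}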
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
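A hedged sketch tying KnownBits to SelectionDAG::computeKnownBits (documented earlier); the helper name and the 16-bit threshold are illustrative, and Op is assumed to come from surrounding lowering code.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"

// Hypothetical helper: query the DAG's known-bits analysis to check whether the
// top 16 bits of Op are provably zero, e.g. before narrowing an operation.
static bool topHalfKnownZero(llvm::SelectionDAG &DAG, llvm::SDValue Op) {
  llvm::KnownBits Known = DAG.computeKnownBits(Op);
  return Known.countMinLeadingZeros() >= 16;
}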
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals