1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
74
79
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, no operations are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
200 // The boolean content concept here is too inflexible. Compares only ever
201 // really produce a 1-bit result. Any copy/extend from these will turn into a
202 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
203 // it's what most targets use.
206
207 // We need to custom lower vector stores from local memory
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
213 Custom);
214
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
220 Custom);
221
222 if (isTypeLegal(MVT::bf16)) {
223 for (unsigned Opc :
232 ISD::SETCC}) {
233 setOperationAction(Opc, MVT::bf16, Promote);
234 }
235
237
239 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
240
244
245 // We only need to custom lower because we can't specify an action for bf16
246 // sources.
249 }
250
251 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
252 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
253 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
254 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
255 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
256 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
257 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
258 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
259 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
260 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
261 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
262 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
263 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
264 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
265 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
266 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
267
268 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
269 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
270 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
271 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
272 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
274 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
275
276 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
277 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
278
282 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
283
284 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
285
287 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
288
290 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
291 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
292
294 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
295 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
296 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
297 Expand);
299 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
300 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
301 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
302 Expand);
303
305 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
306 MVT::v3i16, MVT::v4i16, MVT::Other},
307 Custom);
308
311 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
312
314
316
318 Expand);
319
320#if 0
322#endif
323
324 // We only support LOAD/STORE and vector manipulation ops for vectors
325 // with > 4 elements.
326 for (MVT VT :
327 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
328 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
329 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
330 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
331 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
332 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
333 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
334 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
335 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
336 switch (Op) {
337 case ISD::LOAD:
338 case ISD::STORE:
340 case ISD::BITCAST:
341 case ISD::UNDEF:
345 case ISD::IS_FPCLASS:
346 break;
351 break;
352 default:
354 break;
355 }
356 }
357 }
358
360
361 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
362 // is expanded to avoid having two separate loops in case the index is a VGPR.
363
364 // Most operations are naturally 32-bit vector operations. We only support
365 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
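// (A vector of 64-bit elements occupies the same registers as the corresponding
// 32-bit vector with twice the element count, e.g. v2i64 and v4i32 both live in
// a 128-bit register tuple, so the promoted form reuses the same registers.)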
366 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
368 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
369
371 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
372
374 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
375
377 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
378 }
379
380 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
382 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
383
385 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
386
388 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
389
391 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
392 }
393
394 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
396 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
397
399 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
400
402 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
403
405 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
406 }
407
408 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
410 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
411
413 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
414
416 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
417
419 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
420 }
421
422 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
424 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
425
427 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
428
430 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
431
433 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
434 }
435
437 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
438 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
439 Custom);
440
441 if (Subtarget->hasPkMovB32()) {
442 // TODO: 16-bit element vectors should be legal with even aligned elements.
443 // TODO: Can be legal with wider source types than the result with
444 // subregister extracts.
445 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
446 }
447
449 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
450 // instead lower to cndmask in SITargetLowering::LowerSELECT().
452 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
453 // alignbit.
454 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
455
456 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
457 Custom);
458
459 // Avoid stack access for these.
460 // TODO: Generalize to more vector types.
462 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
463 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
464 Custom);
465
466 // Deal with vec3 vector operations when widened to vec4.
468 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
469
470 // Deal with vec5/6/7 vector operations when widened to vec8.
472 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
473 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
474 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
475 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
476 Custom);
477
478 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
479 // and output demarshalling
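// (The hardware cmpswap takes the new value and the compare value packed into
// an adjacent register pair and returns only the old value.)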
480 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
481
482 // We can't return success/failure, only the old value;
483 // let LLVM add the comparison.
485 Expand);
486
487 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
488
489 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
490
491 // FIXME: This should be narrowed to i32, but that only happens if i64 is
492 // illegal.
493 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
494 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
495
496 // On SI this is s_memtime; on VI it is s_memrealtime.
498
499 if (Subtarget->hasSMemRealTime() ||
500 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
503
504 if (Subtarget->has16BitInsts()) {
507 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
510 } else {
512 }
513
514 if (Subtarget->hasMadMacF32Insts())
516
519
520 // We only really have 32-bit BFE instructions (and 16-bit on VI).
521 //
522 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
523 // effort to match them now. We want this to be false for i64 cases when the
524 // extraction isn't restricted to the upper or lower half. Ideally we would
525 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
526 // span the midpoint are probably relatively rare, so don't worry about them
527 // for now.
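// For reference, v_bfe_u32 dst, src, offset, width computes
// (src >> offset) & ((1 << width) - 1).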
529
530 // Clamp modifier on add/sub
531 if (Subtarget->hasIntClamp())
533
534 if (Subtarget->hasAddNoCarryInsts())
535 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
536 Legal);
537
540 {MVT::f32, MVT::f64}, Custom);
541
542 // These are really only legal for ieee_mode functions. We should be avoiding
543 // them for functions that don't have ieee_mode enabled, so just say they are
544 // legal.
546 {MVT::f32, MVT::f64}, Legal);
547
548 if (Subtarget->haveRoundOpsF64())
550 Legal);
551 else
553 MVT::f64, Custom);
554
556 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
557 Legal);
558 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
559
562
563 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
564 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
565
567 Custom);
569 Custom);
571 Custom);
572
573 // Custom lower these because we can't specify a rule based on an illegal
574 // source bf16.
577
578 if (Subtarget->has16BitInsts()) {
581 MVT::i16, Legal);
582
583 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
584
586 MVT::i16, Expand);
587
591 ISD::CTPOP},
592 MVT::i16, Promote);
593
595
596 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
597
599 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
601 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
602
606
608
609 // F16 - Constant Actions.
612
613 // F16 - Load/Store Actions.
615 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
617 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
618
619 // BF16 - Load/Store Actions.
621 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
623 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
624
625 // F16 - VOP1 Actions.
628 MVT::f16, Custom);
629
630 // BF16 - VOP1 Actions.
631 if (Subtarget->hasBF16TransInsts())
633
636 MVT::f16, Promote);
639 MVT::bf16, Promote);
640
641 // F16 - VOP2 Actions.
642 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
643 Expand);
647
648 // F16 - VOP3 Actions.
650 if (STI.hasMadF16())
652
653 for (MVT VT :
654 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
655 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
656 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
657 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
658 switch (Op) {
659 case ISD::LOAD:
660 case ISD::STORE:
662 case ISD::BITCAST:
663 case ISD::UNDEF:
668 case ISD::IS_FPCLASS:
669 break;
672 case ISD::FSIN:
673 case ISD::FCOS:
675 break;
676 default:
678 break;
679 }
680 }
681 }
682
683 // v_perm_b32 can handle either of these.
684 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
686
687 // XXX - Do these do anything? Vector constants turn into build_vector.
688 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
689
690 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
691 Legal);
692
694 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
696 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
697
699 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
701 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
702
703 setOperationAction(ISD::AND, MVT::v2i16, Promote);
704 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
705 setOperationAction(ISD::OR, MVT::v2i16, Promote);
706 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
707 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
708 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
709
711 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
713 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
714 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
715 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
716
718 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
720 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
722 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
723
725 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
727 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
728 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
729 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
730
732 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
734 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
735
737 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
739 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
741 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
742
743 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
744 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
745 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
746 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
747 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
748 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
749
751 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
753 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
754 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
755 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
756
757 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
758 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
759 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
760 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
761 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
762 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
763
765 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
767 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
768 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
769 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
770
772 MVT::v2i32, Expand);
774
776 MVT::v4i32, Expand);
777
779 MVT::v8i32, Expand);
780
781 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
782 Subtarget->hasVOP3PInsts() ? Legal : Custom);
783
784 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
785 // This isn't really legal, but this avoids the legalizer unrolling it (and
786 // allows matching fneg (fabs x) patterns)
787 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
788
789 // Can do this in one BFI plus a constant materialize.
791 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
792 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
793 MVT::v32f16, MVT::v32bf16},
794 Custom);
795
798 MVT::f16, Custom);
800
803 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
804 Custom);
805
807 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
808 Expand);
809
810 for (MVT Vec16 :
811 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
812 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
815 Vec16, Custom);
817 }
818 }
819
820 if (Subtarget->hasVOP3PInsts()) {
824 MVT::v2i16, Legal);
825
828 MVT::v2f16, Legal);
829
831 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
832
834 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
835 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
836 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
837 Custom);
838
839 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
840 // Split vector operations.
845 VT, Custom);
846
847 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
848 // Split vector operations.
850 VT, Custom);
851
854 {MVT::v2f16, MVT::v4f16}, Custom);
855
856 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
857 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
858 Custom);
859
860 if (Subtarget->hasBF16PackedInsts()) {
861 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
862 // Split vector operations.
864 VT, Custom);
865 }
866
867 if (Subtarget->hasPackedFP32Ops()) {
869 MVT::v2f32, Legal);
871 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
872 Custom);
873 }
874 }
875
877
878 if (Subtarget->has16BitInsts()) {
880 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
882 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
883 } else {
884 // Legalization hack.
885 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
886
888 }
889
891 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
892 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
893 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
894 MVT::v32f16, MVT::v32bf16},
895 Custom);
896
898
899 if (Subtarget->hasVectorMulU64())
901 else if (Subtarget->hasScalarSMulU64())
903
904 if (Subtarget->hasMad64_32())
906
907 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
909
910 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
912 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
913 } else {
914 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
915 if (Subtarget->hasMinimum3Maximum3F32())
917
918 if (Subtarget->hasMinimum3Maximum3PKF16()) {
920
921 // If only the vector form is available, we need to widen to a vector.
922 if (!Subtarget->hasMinimum3Maximum3F16())
924 }
925 }
926
927 if (Subtarget->hasVOP3PInsts()) {
928 // We want to break these into v2f16 pieces, not scalarize.
930 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
931 Custom);
932 }
933
934 if (Subtarget->hasIntMinMax64())
936 Legal);
937
939 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
940 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
941 MVT::i8},
942 Custom);
943
945 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
946 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
947 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
948 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
949 Custom);
950
952 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
953 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
954 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
955 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
956 Custom);
957
963
964 // TODO: Could move this to custom lowering, could benefit from combines on
965 // extract of relevant bits.
967
969
970 if (Subtarget->hasBF16ConversionInsts()) {
971 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
973 }
974
975 if (Subtarget->hasBF16PackedInsts()) {
978 MVT::v2bf16, Legal);
979 }
980
981 if (Subtarget->hasBF16TransInsts()) {
983 }
984
985 if (Subtarget->hasCvtPkF16F32Inst()) {
987 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
988 Custom);
989 }
990
994 ISD::SUB,
996 ISD::MUL,
997 ISD::FADD,
998 ISD::FSUB,
999 ISD::FDIV,
1000 ISD::FMUL,
1009 ISD::FMA,
1010 ISD::SMIN,
1011 ISD::SMAX,
1012 ISD::UMIN,
1013 ISD::UMAX,
1014 ISD::SETCC,
1016 ISD::SMIN,
1017 ISD::SMAX,
1018 ISD::UMIN,
1019 ISD::UMAX,
1020 ISD::AND,
1021 ISD::OR,
1022 ISD::XOR,
1023 ISD::SHL,
1024 ISD::SRL,
1025 ISD::SRA,
1026 ISD::FSHR,
1037
1038 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1040
1041 // All memory operations. Some folding on the pointer operand is done to help
1042 // matching the constant offsets in the addressing modes.
1044 ISD::STORE,
1069
1070 // FIXME: In other contexts we pretend this is a per-function property.
1072
1074}
1075
1076const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1077
1079 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1080 return RCRegs;
1081}
1082
1083//===----------------------------------------------------------------------===//
1084// TargetLowering queries
1085//===----------------------------------------------------------------------===//
1086
1087// v_mad_mix* support a conversion from f16 to f32.
1088//
1089 // There is only one special case, when denormals are enabled, where this is
1090 // still OK to use, and we don't currently handle it.
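// For example, (fma (fpext f16:$a), (fpext f16:$b), f32:$c) can be selected to
// a single v_fma_mix_f32, which applies the f16-to-f32 conversion through its
// op_sel_hi source modifiers.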
1091bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1092 EVT DestVT, EVT SrcVT) const {
1093 return DestVT.getScalarType() == MVT::f32 &&
1094 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1095 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1096 SrcVT.getScalarType() == MVT::f16) ||
1097 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1098 SrcVT.getScalarType() == MVT::bf16)) &&
1099 // TODO: This probably only requires no input flushing?
1101}
1102
1104 LLT DestTy, LLT SrcTy) const {
1105 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1106 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1107 DestTy.getScalarSizeInBits() == 32 &&
1108 SrcTy.getScalarSizeInBits() == 16 &&
1109 // TODO: This probably only requires no input flushing?
1110 denormalModeIsFlushAllF32(*MI.getMF());
1111}
1112
1114 // SI has some legal vector types, but no legal vector operations. Say no
1115 // shuffles are legal in order to prefer scalarizing some vector operations.
1116 return false;
1117}
1118
1120 CallingConv::ID CC,
1121 EVT VT) const {
1123 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1124
1125 if (VT.isVector()) {
1126 EVT ScalarVT = VT.getScalarType();
1127 unsigned Size = ScalarVT.getSizeInBits();
1128 if (Size == 16) {
1129 return Subtarget->has16BitInsts()
1130 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
1131 : MVT::i32;
1132 }
1133
1134 if (Size < 16)
1135 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1136 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1137 }
1138
1139 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1140 return MVT::i32;
1141
1142 if (VT.getSizeInBits() > 32)
1143 return MVT::i32;
1144
1145 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1146}
1147
1149 CallingConv::ID CC,
1150 EVT VT) const {
1152 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1153
1154 if (VT.isVector()) {
1155 unsigned NumElts = VT.getVectorNumElements();
1156 EVT ScalarVT = VT.getScalarType();
1157 unsigned Size = ScalarVT.getSizeInBits();
1158
1159 // FIXME: Should probably promote 8-bit vectors to i16.
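// e.g. a v5f16 argument is passed in (5 + 1) / 2 = 3 packed registers.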
1160 if (Size == 16)
1161 return (NumElts + 1) / 2;
1162
1163 if (Size <= 32)
1164 return NumElts;
1165
1166 if (Size > 32)
1167 return NumElts * ((Size + 31) / 32);
1168 } else if (VT.getSizeInBits() > 32)
1169 return (VT.getSizeInBits() + 31) / 32;
1170
1171 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1172}
1173
1175 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1176 unsigned &NumIntermediates, MVT &RegisterVT) const {
1177 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1178 unsigned NumElts = VT.getVectorNumElements();
1179 EVT ScalarVT = VT.getScalarType();
1180 unsigned Size = ScalarVT.getSizeInBits();
1181 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1182 // support, but unless we can properly handle 3-vectors, it will still be
1183 // inconsistent.
1184 if (Size == 16) {
1185 MVT SimpleIntermediateVT =
1187 IntermediateVT = SimpleIntermediateVT;
1188 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1189 NumIntermediates = (NumElts + 1) / 2;
1190 return (NumElts + 1) / 2;
1191 }
1192
1193 if (Size == 32) {
1194 RegisterVT = ScalarVT.getSimpleVT();
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts;
1197 return NumIntermediates;
1198 }
1199
1200 if (Size < 16 && Subtarget->has16BitInsts()) {
1201 // FIXME: Should probably form v2i16 pieces
1202 RegisterVT = MVT::i16;
1203 IntermediateVT = ScalarVT;
1204 NumIntermediates = NumElts;
1205 return NumIntermediates;
1206 }
1207
1208 if (Size != 16 && Size <= 32) {
1209 RegisterVT = MVT::i32;
1210 IntermediateVT = ScalarVT;
1211 NumIntermediates = NumElts;
1212 return NumIntermediates;
1213 }
1214
1215 if (Size > 32) {
1216 RegisterVT = MVT::i32;
1217 IntermediateVT = RegisterVT;
1218 NumIntermediates = NumElts * ((Size + 31) / 32);
1219 return NumIntermediates;
1220 }
1221 }
1222
1224 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1225}
1226
1228 const DataLayout &DL, Type *Ty,
1229 unsigned MaxNumLanes) {
1230 assert(MaxNumLanes != 0);
1231
1232 LLVMContext &Ctx = Ty->getContext();
1233 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1234 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1235 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1236 NumElts);
1237 }
1238
1239 return TLI.getValueType(DL, Ty);
1240}
1241
1242// Peek through TFE struct returns to only use the data size.
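// With TFE/LWE enabled, such intrinsics return { <data>, i32 }, where the
// trailing i32 is the fail/status dword; only the data member determines the
// memory VT.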
1244 const DataLayout &DL, Type *Ty,
1245 unsigned MaxNumLanes) {
1246 auto *ST = dyn_cast<StructType>(Ty);
1247 if (!ST)
1248 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1249
1250 // TFE intrinsics return an aggregate type.
1251 assert(ST->getNumContainedTypes() == 2 &&
1252 ST->getContainedType(1)->isIntegerTy(32));
1253 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1254}
1255
1256/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1257/// in-memory representation. This return value is a custom type because there
1258/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1259/// could cause issues during codegen, these address space 7 pointers will be
1260/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1261/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1262/// for cost modeling, to work. (This also sets us up decently for doing the
1263/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
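/// For example, an IR `ptr addrspace(7)` is 160 bits wide (a 128-bit buffer
/// resource descriptor plus a 32-bit offset) and is mapped to
/// MVT::amdgpuBufferFatPointer below.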
1265 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1266 return MVT::amdgpuBufferFatPointer;
1268 DL.getPointerSizeInBits(AS) == 192)
1269 return MVT::amdgpuBufferStridedPointer;
1271}
1272/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1273/// v8i32 when padding is added.
1274/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1275/// also v8i32 with padding.
1277 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1278 DL.getPointerSizeInBits(AS) == 160) ||
1280 DL.getPointerSizeInBits(AS) == 192))
1281 return MVT::v8i32;
1283}
1284
1285static unsigned getIntrMemWidth(unsigned IntrID) {
1286 switch (IntrID) {
1287 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1288 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1289 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1290 return 8;
1291 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1292 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1293 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1294 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1295 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1296 return 32;
1297 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1298 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1300 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1301 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1302 return 64;
1303 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1304 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1305 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1306 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1307 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1308 return 128;
1309 default:
1310 llvm_unreachable("Unknown width");
1311 }
1312}
1313
1314static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
1316 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1317 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1318 switch (AtomicOrderingCABI(Ord)) {
1320 Info.order = AtomicOrdering::Acquire;
1321 break;
1323 Info.order = AtomicOrdering::Release;
1324 break;
1327 break;
1328 default:
1329 Info.order = AtomicOrdering::Monotonic;
1330 break;
1331 }
1332
1333 Info.flags =
1335 Info.flags |= MOCooperative;
1336
1337 MDNode *ScopeMD = cast<MDNode>(
1338 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1339 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1340 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1341}
1342
1344 const CallBase &CI,
1345 MachineFunction &MF,
1346 unsigned IntrID) const {
1347 Info.flags = MachineMemOperand::MONone;
1348 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1349 Info.flags |= MachineMemOperand::MOInvariant;
1350 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1352 Info.flags |= getTargetMMOFlags(CI);
1353
1354 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1356 AttributeSet Attr =
1358 MemoryEffects ME = Attr.getMemoryEffects();
1359 if (ME.doesNotAccessMemory())
1360 return false;
1361
1362 // TODO: Should images get their own address space?
1363 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1364
1365 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1366 if (RsrcIntr->IsImage) {
1367 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1369 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1370 Info.align.reset();
1371 }
1372
1373 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1374 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1375 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1376 // We conservatively set the memory operand of a buffer intrinsic to the
1377 // base resource pointer, so that we can access alias information about
1378 // those pointers. Cases like "this points at the same value
1379 // but with a different offset" are handled in
1380 // areMemAccessesTriviallyDisjoint.
1381 Info.ptrVal = RsrcArg;
1382 }
1383
1384 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1385 if (!IsSPrefetch) {
1386 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1387 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1388 Info.flags |= MachineMemOperand::MOVolatile;
1389 }
1390
1392 if (ME.onlyReadsMemory()) {
1393 if (RsrcIntr->IsImage) {
1394 unsigned MaxNumLanes = 4;
1395
1396 if (!BaseOpcode->Gather4) {
1397 // If this isn't a gather, we may have excess loaded elements in the
1398 // IR type. Check the dmask for the real number of elements loaded.
1399 unsigned DMask =
1400 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1401 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1402 }
1403
1404 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1405 CI.getType(), MaxNumLanes);
1406 } else {
1407 Info.memVT =
1409 std::numeric_limits<unsigned>::max());
1410 }
1411
1412 // FIXME: What does alignment mean for an image?
1413 Info.opc = ISD::INTRINSIC_W_CHAIN;
1414 Info.flags |= MachineMemOperand::MOLoad;
1415 } else if (ME.onlyWritesMemory()) {
1416 Info.opc = ISD::INTRINSIC_VOID;
1417
1418 Type *DataTy = CI.getArgOperand(0)->getType();
1419 if (RsrcIntr->IsImage) {
1420 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1421 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1422 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1423 DMaskLanes);
1424 } else
1425 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1426
1427 Info.flags |= MachineMemOperand::MOStore;
1428 } else {
1429 // Atomic, NoReturn Sampler or prefetch
1430 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1432 Info.flags |=
1434
1435 if (!IsSPrefetch)
1436 Info.flags |= MachineMemOperand::MOStore;
1437
1438 switch (IntrID) {
1439 default:
1440 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1441 // Fake memory access type for no return sampler intrinsics
1442 Info.memVT = MVT::i32;
1443 } else {
1444 // XXX - Should this be volatile without known ordering?
1445 Info.flags |= MachineMemOperand::MOVolatile;
1446 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1447 }
1448 break;
1449 case Intrinsic::amdgcn_raw_buffer_load_lds:
1450 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1451 case Intrinsic::amdgcn_struct_buffer_load_lds:
1452 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1453 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1454 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1455 Info.ptrVal = CI.getArgOperand(1);
1456 return true;
1457 }
1458 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1459 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1460 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1461 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1462 Info.memVT =
1464 std::numeric_limits<unsigned>::max());
1465 Info.flags &= ~MachineMemOperand::MOStore;
1466 return true;
1467 }
1468 }
1469 }
1470 return true;
1471 }
1472
1473 switch (IntrID) {
1474 case Intrinsic::amdgcn_ds_ordered_add:
1475 case Intrinsic::amdgcn_ds_ordered_swap: {
1476 Info.opc = ISD::INTRINSIC_W_CHAIN;
1477 Info.memVT = MVT::getVT(CI.getType());
1478 Info.ptrVal = CI.getOperand(0);
1479 Info.align.reset();
1481
1482 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1483 if (!Vol->isZero())
1484 Info.flags |= MachineMemOperand::MOVolatile;
1485
1486 return true;
1487 }
1488 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1489 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1490 Info.opc = ISD::INTRINSIC_W_CHAIN;
1491 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1492 Info.ptrVal = nullptr;
1493 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1495 return true;
1496 }
1497 case Intrinsic::amdgcn_ds_append:
1498 case Intrinsic::amdgcn_ds_consume: {
1499 Info.opc = ISD::INTRINSIC_W_CHAIN;
1500 Info.memVT = MVT::getVT(CI.getType());
1501 Info.ptrVal = CI.getOperand(0);
1502 Info.align.reset();
1504
1505 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1506 if (!Vol->isZero())
1507 Info.flags |= MachineMemOperand::MOVolatile;
1508
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1512 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1513 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1516 Info.memVT = MVT::getVT(CI.getType());
1517 Info.ptrVal = CI.getOperand(0);
1518 Info.memVT = MVT::i64;
1519 Info.size = 8;
1520 Info.align.reset();
1522 return true;
1523 }
1524 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1525 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1526 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1527 Info.opc = ISD::INTRINSIC_W_CHAIN;
1528 Info.memVT =
1529 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1530 ? CI.getType()
1532 ->getElementType(0)); // XXX: what is correct VT?
1533
1534 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1535 Info.align.reset();
1536 Info.flags |=
1538 return true;
1539 }
1540 case Intrinsic::amdgcn_global_atomic_fmin_num:
1541 case Intrinsic::amdgcn_global_atomic_fmax_num:
1542 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1543 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1544 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1545 Info.opc = ISD::INTRINSIC_W_CHAIN;
1546 Info.memVT = MVT::getVT(CI.getType());
1547 Info.ptrVal = CI.getOperand(0);
1548 Info.align.reset();
1552 return true;
1553 }
1554 case Intrinsic::amdgcn_flat_load_monitor_b32:
1555 case Intrinsic::amdgcn_flat_load_monitor_b64:
1556 case Intrinsic::amdgcn_flat_load_monitor_b128:
1557 case Intrinsic::amdgcn_global_load_monitor_b32:
1558 case Intrinsic::amdgcn_global_load_monitor_b64:
1559 case Intrinsic::amdgcn_global_load_monitor_b128:
1560 case Intrinsic::amdgcn_cluster_load_b32:
1561 case Intrinsic::amdgcn_cluster_load_b64:
1562 case Intrinsic::amdgcn_cluster_load_b128:
1563 case Intrinsic::amdgcn_ds_load_tr6_b96:
1564 case Intrinsic::amdgcn_ds_load_tr4_b64:
1565 case Intrinsic::amdgcn_ds_load_tr8_b64:
1566 case Intrinsic::amdgcn_ds_load_tr16_b128:
1567 case Intrinsic::amdgcn_global_load_tr6_b96:
1568 case Intrinsic::amdgcn_global_load_tr4_b64:
1569 case Intrinsic::amdgcn_global_load_tr_b64:
1570 case Intrinsic::amdgcn_global_load_tr_b128:
1571 case Intrinsic::amdgcn_ds_read_tr4_b64:
1572 case Intrinsic::amdgcn_ds_read_tr6_b96:
1573 case Intrinsic::amdgcn_ds_read_tr8_b64:
1574 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1575 Info.opc = ISD::INTRINSIC_W_CHAIN;
1576 Info.memVT = MVT::getVT(CI.getType());
1577 Info.ptrVal = CI.getOperand(0);
1578 Info.align.reset();
1579 Info.flags |= MachineMemOperand::MOLoad;
1580 return true;
1581 }
1582 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1583 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1584 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1585 Info.opc = ISD::INTRINSIC_W_CHAIN;
1586 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1587 Info.ptrVal = CI.getOperand(0);
1588 Info.align.reset();
1589 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1590 return true;
1591 }
1592 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1593 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1594 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1595 Info.opc = ISD::INTRINSIC_VOID;
1596 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1597 Info.ptrVal = CI.getArgOperand(0);
1598 Info.align.reset();
1599 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1600 return true;
1601 }
1602 case Intrinsic::amdgcn_ds_gws_init:
1603 case Intrinsic::amdgcn_ds_gws_barrier:
1604 case Intrinsic::amdgcn_ds_gws_sema_v:
1605 case Intrinsic::amdgcn_ds_gws_sema_br:
1606 case Intrinsic::amdgcn_ds_gws_sema_p:
1607 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1608 Info.opc = ISD::INTRINSIC_VOID;
1609
1610 const GCNTargetMachine &TM =
1611 static_cast<const GCNTargetMachine &>(getTargetMachine());
1612
1614 Info.ptrVal = MFI->getGWSPSV(TM);
1615
1616 // This is an abstract access, but we need to specify a type and size.
1617 Info.memVT = MVT::i32;
1618 Info.size = 4;
1619 Info.align = Align(4);
1620
1621 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1622 Info.flags |= MachineMemOperand::MOLoad;
1623 else
1624 Info.flags |= MachineMemOperand::MOStore;
1625 return true;
1626 }
1627 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1631 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1632 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1633 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1634 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1635 Info.opc = ISD::INTRINSIC_VOID;
1636 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1637 Info.ptrVal = CI.getArgOperand(1);
1639 return true;
1640 }
1641 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1642 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1643 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1644 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1645 Info.opc = ISD::INTRINSIC_VOID;
1646 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1647 Info.ptrVal = CI.getArgOperand(0);
1649 return true;
1650 }
1651 case Intrinsic::amdgcn_load_to_lds:
1652 case Intrinsic::amdgcn_global_load_lds: {
1653 Info.opc = ISD::INTRINSIC_VOID;
1654 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1655 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1656 Info.ptrVal = CI.getArgOperand(1);
1658 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1659 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1660 Info.flags |= MachineMemOperand::MOVolatile;
1661 return true;
1662 }
1663 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1664 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1665 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1666 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1667 Info.opc = ISD::INTRINSIC_W_CHAIN;
1668
1669 const GCNTargetMachine &TM =
1670 static_cast<const GCNTargetMachine &>(getTargetMachine());
1671
1673 Info.ptrVal = MFI->getGWSPSV(TM);
1674
1675 // This is an abstract access, but we need to specify a type and size.
1676 Info.memVT = MVT::i32;
1677 Info.size = 4;
1678 Info.align = Align(4);
1679
1681 return true;
1682 }
1683 case Intrinsic::amdgcn_s_prefetch_data:
1684 case Intrinsic::amdgcn_flat_prefetch:
1685 case Intrinsic::amdgcn_global_prefetch: {
1686 Info.opc = ISD::INTRINSIC_VOID;
1687 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1688 Info.ptrVal = CI.getArgOperand(0);
1689 Info.flags |= MachineMemOperand::MOLoad;
1690 return true;
1691 }
1692 default:
1693 return false;
1694 }
1695}
1696
1698 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1700 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1701 // The DAG's ValueType loses the addrspaces.
1702 // Add them as 2 extra Constant operands "from" and "to".
1703 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1704 unsigned DstAS = I.getType()->getPointerAddressSpace();
1705 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1706 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1707 break;
1708 }
1709 default:
1710 break;
1711 }
1712}
1713
1716 Type *&AccessTy) const {
1717 Value *Ptr = nullptr;
1718 switch (II->getIntrinsicID()) {
1719 case Intrinsic::amdgcn_cluster_load_b128:
1720 case Intrinsic::amdgcn_cluster_load_b64:
1721 case Intrinsic::amdgcn_cluster_load_b32:
1722 case Intrinsic::amdgcn_ds_append:
1723 case Intrinsic::amdgcn_ds_consume:
1724 case Intrinsic::amdgcn_ds_load_tr8_b64:
1725 case Intrinsic::amdgcn_ds_load_tr16_b128:
1726 case Intrinsic::amdgcn_ds_load_tr4_b64:
1727 case Intrinsic::amdgcn_ds_load_tr6_b96:
1728 case Intrinsic::amdgcn_ds_read_tr4_b64:
1729 case Intrinsic::amdgcn_ds_read_tr6_b96:
1730 case Intrinsic::amdgcn_ds_read_tr8_b64:
1731 case Intrinsic::amdgcn_ds_read_tr16_b64:
1732 case Intrinsic::amdgcn_ds_ordered_add:
1733 case Intrinsic::amdgcn_ds_ordered_swap:
1734 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1735 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1736 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1737 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1738 case Intrinsic::amdgcn_flat_load_monitor_b128:
1739 case Intrinsic::amdgcn_flat_load_monitor_b32:
1740 case Intrinsic::amdgcn_flat_load_monitor_b64:
1741 case Intrinsic::amdgcn_global_atomic_fmax_num:
1742 case Intrinsic::amdgcn_global_atomic_fmin_num:
1743 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1744 case Intrinsic::amdgcn_global_load_monitor_b128:
1745 case Intrinsic::amdgcn_global_load_monitor_b32:
1746 case Intrinsic::amdgcn_global_load_monitor_b64:
1747 case Intrinsic::amdgcn_global_load_tr_b64:
1748 case Intrinsic::amdgcn_global_load_tr_b128:
1749 case Intrinsic::amdgcn_global_load_tr4_b64:
1750 case Intrinsic::amdgcn_global_load_tr6_b96:
1751 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1752 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1753 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1754 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1755 Ptr = II->getArgOperand(0);
1756 break;
1757 case Intrinsic::amdgcn_load_to_lds:
1758 case Intrinsic::amdgcn_global_load_lds:
1759 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1760 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1761 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1762 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1763 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1764 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1765 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1766 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1767 Ptr = II->getArgOperand(1);
1768 break;
1769 default:
1770 return false;
1771 }
1772 AccessTy = II->getType();
1773 Ops.push_back(Ptr);
1774 return true;
1775}
1776
1778 unsigned AddrSpace) const {
1779 if (!Subtarget->hasFlatInstOffsets()) {
1780 // Flat instructions do not have offsets, and only have the register
1781 // address.
1782 return AM.BaseOffs == 0 && AM.Scale == 0;
1783 }
1784
1785 decltype(SIInstrFlags::FLAT) FlatVariant =
1789
1790 return AM.Scale == 0 &&
1791 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1792 AM.BaseOffs, AddrSpace, FlatVariant));
1793}
1794
1796 if (Subtarget->hasFlatGlobalInsts())
1798
1799 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1800 // Assume that we will use FLAT for all global memory accesses
1801 // on VI.
1802 // FIXME: This assumption is currently wrong. On VI we still use
1803 // MUBUF instructions for the r + i addressing mode. As currently
1804 // implemented, the MUBUF instructions only work on buffers < 4GB.
1805 // It may be possible to support > 4GB buffers with MUBUF instructions,
1806 // by setting the stride value in the resource descriptor which would
1807 // increase the size limit to (stride * 4GB). However, this is risky,
1808 // because it has never been validated.
1810 }
1811
1812 return isLegalMUBUFAddressingMode(AM);
1813}
1814
1815bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1816 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1817 // additionally can do r + r + i with addr64. 32-bit has more addressing
1818 // mode options. Depending on the resource constant, it can also do
1819 // (i64 r0) + (i32 r1) * (i14 i).
1820 //
1821 // Private arrays end up using a scratch buffer most of the time, so also
1822 // assume those use MUBUF instructions. Scratch loads / stores are currently
1823 // implemented as mubuf instructions with offen bit set, so slightly
1824 // different than the normal addr64.
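// For example, the addr64 form looks roughly like
//   buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 offset:16
// where v[1:2] holds the 64-bit address and offset is the 12-bit immediate.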
1825 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1826 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1827 return false;
1828
1829 // FIXME: Since we can split immediate into soffset and immediate offset,
1830 // would it make sense to allow any immediate?
1831
1832 switch (AM.Scale) {
1833 case 0: // r + i or just i, depending on HasBaseReg.
1834 return true;
1835 case 1:
1836 return true; // We have r + r or r + i.
1837 case 2:
1838 if (AM.HasBaseReg) {
1839 // Reject 2 * r + r.
1840 return false;
1841 }
1842
1843 // Allow 2 * r as r + r,
1844 // and 2 * r + i as r + r + i.
1845 return true;
1846 default: // Don't allow n * r
1847 return false;
1848 }
1849}
1850
1852 const AddrMode &AM, Type *Ty,
1853 unsigned AS,
1854 Instruction *I) const {
1855 // No global is ever allowed as a base.
1856 if (AM.BaseGV)
1857 return false;
1858
1859 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1860 return isLegalGlobalAddressingMode(AM);
1861
1862 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1866 // If the offset isn't a multiple of 4, it probably isn't going to be
1867 // correctly aligned.
1868 // FIXME: Can we get the real alignment here?
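// e.g. a byte offset of 6 cannot be encoded as an SMRD dword offset, so fall
// back to the MUBUF path below.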
1869 if (AM.BaseOffs % 4 != 0)
1870 return isLegalMUBUFAddressingMode(AM);
1871
1872 if (!Subtarget->hasScalarSubwordLoads()) {
1873 // There are no SMRD extloads, so if we have to do a small type access we
1874 // will use a MUBUF load.
1875 // FIXME?: We also need to do this if unaligned, but we don't know the
1876 // alignment here.
1877 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1878 return isLegalGlobalAddressingMode(AM);
1879 }
1880
1881 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1882 // SMRD instructions have an 8-bit, dword offset on SI.
1883 if (!isUInt<8>(AM.BaseOffs / 4))
1884 return false;
1885 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1886 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1887 // in 8-bits, it can use a smaller encoding.
1888 if (!isUInt<32>(AM.BaseOffs / 4))
1889 return false;
1890 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1891 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1892 if (!isUInt<20>(AM.BaseOffs))
1893 return false;
1894 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1895 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1896 // for S_BUFFER_* instructions).
1897 if (!isInt<21>(AM.BaseOffs))
1898 return false;
1899 } else {
1900 // On GFX12, all offsets are signed 24-bit in bytes.
1901 if (!isInt<24>(AM.BaseOffs))
1902 return false;
1903 }
1904
1905 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1907 AM.BaseOffs < 0) {
1908 // Scalar (non-buffer) loads can only use a negative offset if
1909 // soffset+offset is non-negative. Since the compiler can only prove that
1910 // in a few special cases, it is safer to claim that negative offsets are
1911 // not supported.
1912 return false;
1913 }
1914
1915 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1916 return true;
1917
1918 if (AM.Scale == 1 && AM.HasBaseReg)
1919 return true;
1920
1921 return false;
1922 }
1923
1924 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1925 return Subtarget->hasFlatScratchEnabled()
1927 : isLegalMUBUFAddressingMode(AM);
1928
1929 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1930 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1931 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1932 // field.
1933 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1934 // an 8-bit dword offset but we don't know the alignment here.
1935 if (!isUInt<16>(AM.BaseOffs))
1936 return false;
1937
1938 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1939 return true;
1940
1941 if (AM.Scale == 1 && AM.HasBaseReg)
1942 return true;
1943
1944 return false;
1945 }
1946
1947 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1948 // For an unknown address space, this usually means that this is for some
1949 // reason being used for pure arithmetic, and not based on some addressing
1950 // computation. We don't have instructions that compute pointers with any
1951 // addressing modes, so treat them as having no offset like flat
1952 // instructions.
1953 return isLegalFlatAddressingMode(AM, AS);
1954 }
1955
1956 // Assume a user alias of global for unknown address spaces.
1957 return isLegalGlobalAddressingMode(AM);
1958}
1959
1960 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1961 const MachineFunction &MF) const {
1962 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1963 return (MemVT.getSizeInBits() <= 4 * 32);
1964 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1965 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1966 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1967 }
1968 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1969 return (MemVT.getSizeInBits() <= 2 * 32);
1970 return true;
1971}
1972
1973 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1974 unsigned Size, unsigned AddrSpace, Align Alignment,
1975 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1976 if (IsFast)
1977 *IsFast = 0;
1978
1979 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1980 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1981 // Check if alignment requirements for ds_read/write instructions are
1982 // disabled.
1983 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1984 return false;
1985
1986 Align RequiredAlignment(
1987 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1988 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
1989 Alignment < RequiredAlignment)
1990 return false;
1991
1992 // Either the alignment requirements are "enabled", or there is an
1993 // unaligned LDS access related hardware bug even though alignment
1994 // requirements are "disabled". In either case, we need to check for proper
1995 // alignment.
1996 //
1997 switch (Size) {
1998 case 64:
1999 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2000 // address is negative, then the instruction is incorrectly treated as
2001 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2002 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2003 // load later in the SILoadStoreOptimizer.
2004 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2005 return false;
2006
2007 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2008 // can do a 4-byte aligned, 8-byte access in a single operation using
2009 // ds_read2/write2_b32 with adjacent offsets.
2010 RequiredAlignment = Align(4);
2011
2012 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2013 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2014 // ds_write2_b32 depending on the alignment. In either case with either
2015 // alignment there is no faster way of doing this.
2016
2017 // The numbers returned here and below are not additive; they form a 'speed
2018 // rank'. They are only meant to be compared to decide whether a certain way
2019 // of lowering an operation is faster than another. For that purpose a
2020 // naturally aligned operation gets its bitsize to indicate that "it
2021 // operates at a speed comparable to an N-bit wide load". With full
2022 // alignment, ds128 is slower than ds96, for example. If underaligned, it
2023 // is comparable to the speed of a single dword access, which would then
2024 // mean 32 < 128 and it is faster to issue a wide load regardless.
2025 // 1 simply means "slow, don't do it". I.e. when comparing an aligned load
2026 // to a wider load which will no longer be aligned, the latter is slower.
2027 if (IsFast)
2028 *IsFast = (Alignment >= RequiredAlignment) ? 64
2029 : (Alignment < Align(4)) ? 32
2030 : 1;
2031 return true;
2032 }
2033
2034 break;
2035 case 96:
2036 if (!Subtarget->hasDS96AndDS128())
2037 return false;
2038
2039 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2040 // gfx8 and older.
2041
2042 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2043 // Naturally aligned access is fastest. However, also report it as Fast
2044 // if memory is aligned to less than a DWORD. A narrow load or store will
2045 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2046 // be more of them, so overall we pay less penalty by issuing a single
2047 // instruction.
2048
2049 // See comment on the values above.
2050 if (IsFast)
2051 *IsFast = (Alignment >= RequiredAlignment) ? 96
2052 : (Alignment < Align(4)) ? 32
2053 : 1;
2054 return true;
2055 }
2056
2057 break;
2058 case 128:
2059 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2060 return false;
2061
2062 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2063 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2064 // single operation using ds_read2/write2_b64.
2065 RequiredAlignment = Align(8);
2066
2067 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2068 // Naturally aligned access is fastest. However, also report it as Fast
2069 // if memory is aligned to less than a DWORD. A narrow load or store will
2070 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2071 // will be more of them, so overall we pay less penalty by issuing a
2072 // single instruction.
2073
2074 // See comment on the values above.
2075 if (IsFast)
2076 *IsFast = (Alignment >= RequiredAlignment) ? 128
2077 : (Alignment < Align(4)) ? 32
2078 : 1;
2079 return true;
2080 }
2081
2082 break;
2083 default:
2084 if (Size > 32)
2085 return false;
2086
2087 break;
2088 }
2089
2090 // See comment on the values above.
2091 // Note that we have a single-dword or sub-dword here, so if underaligned
2092 // it is the slowest possible access, hence the returned value is 0.
2093 if (IsFast)
2094 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2095
2096 return Alignment >= RequiredAlignment ||
2097 Subtarget->hasUnalignedDSAccessEnabled();
2098 }
2099
2100 // FIXME: We have to be conservative here and assume that flat operations
2101 // will access scratch. If we had access to the IR function, then we
2102 // could determine if any private memory was used in the function.
2103 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2104 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2105 bool AlignedBy4 = Alignment >= Align(4);
2106 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2107 if (IsFast)
2108 *IsFast = AlignedBy4 ? Size : 1;
2109 return true;
2110 }
2111
2112 if (IsFast)
2113 *IsFast = AlignedBy4;
2114
2115 return AlignedBy4;
2116 }
2117
2118 // So long as they are correct, wide global memory operations perform better
2119 // than multiple smaller memory ops -- even when misaligned
2120 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2121 if (IsFast)
2122 *IsFast = Size;
2123
2124 return Alignment >= Align(4) ||
2125 Subtarget->hasUnalignedBufferAccessEnabled();
2126 }
2127
2128 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2129 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2130 // out-of-bounds behavior, but in the edge case where an access starts
2131 // out-of-bounds and then enters in-bounds, the entire access would be treated
2132 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2133 // natural alignment of buffer accesses.
2134 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2135 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2136 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2137 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2138 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2139 return false;
2140 }
2141
2142 // Values smaller than a dword must be aligned.
2143 if (Size < 32)
2144 return false;
2145
2146 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2147 // byte-address are ignored, thus forcing Dword alignment.
2148 // This applies to private, global, and constant memory.
2149 if (IsFast)
2150 *IsFast = 1;
2151
2152 return Size >= 32 && Alignment >= Align(4);
2153}
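// [Editor's illustration, not part of the LLVM source] Examples of the
// "speed rank" reported through *IsFast by the LDS path above when unaligned
// DS access is enabled: a 64-bit access reports 64 for any alignment >= 4
// (ds_read_b64 or ds_read2_b32) and 32 below that; a 128-bit access reports
// 128 for alignment >= 8, 1 for an alignment of exactly 4, and 32 below 4.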
2154
2155 bool SITargetLowering::allowsMisalignedMemoryAccesses(
2156 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2157 unsigned *IsFast) const {
2158 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2159 Alignment, Flags, IsFast);
2160}
2161
2162 EVT SITargetLowering::getOptimalMemOpType(
2163 LLVMContext &Context, const MemOp &Op,
2164 const AttributeList &FuncAttributes) const {
2165 // FIXME: Should account for address space here.
2166
2167 // The default fallback uses the private pointer size as a guess for a type to
2168 // use. Make sure we switch these to 64-bit accesses.
2169
2170 if (Op.size() >= 16 &&
2171 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2172 return MVT::v4i32;
2173
2174 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2175 return MVT::v2i32;
2176
2177 // Use the default.
2178 return MVT::Other;
2179}
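// [Editor's illustration, not part of the LLVM source] For a hypothetical
// 32-byte memcpy whose destination is known to be 4-byte aligned, the hook
// above returns v4i32, so the expansion uses 16-byte vector accesses; a
// 12-byte copy with the same alignment falls back to v2i32 and the generic
// expansion handles the remaining tail.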
2180
2181 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2182 const MemSDNode *MemNode = cast<MemSDNode>(N);
2183 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2184}
2185
2190
2191 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2192 unsigned DestAS) const {
2193 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2194 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2195 Subtarget->hasGloballyAddressableScratch()) {
2196 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2197 return false;
2198 }
2199
2200 // Flat -> private/local is a simple truncate.
2201 // Flat -> global is no-op
2202 return true;
2203 }
2204
2205 const GCNTargetMachine &TM =
2206 static_cast<const GCNTargetMachine &>(getTargetMachine());
2207 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2208}
2209
2217
2218 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2219 Type *Ty) const {
2220 // FIXME: Could be smarter if called for vector constants.
2221 return true;
2222}
2223
2224 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2225 unsigned Index) const {
2226 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2227 return false;
2228
2229 // TODO: Add more cases that are cheap.
2230 return Index == 0;
2231}
2232
2233bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2234 // TODO: This should be more aggressive, particular for 16-bit element
2235 // vectors. However there are some mixed improvements and regressions.
2236 EVT EltTy = VT.getVectorElementType();
2237 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2238 return EltTy.getSizeInBits() % MinAlign == 0;
2239}
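// [Editor's illustration, not part of the LLVM source] With real true16
// instructions the minimum cheaply addressable granule is 16 bits, so
// extracting an i16/f16 element is considered cheap (16 % 16 == 0); without
// them the granule is 32 bits, so 16-bit extracts are not cheap
// (16 % 32 != 0) while 32-bit and larger elements still are.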
2240
2241 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2242 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2243 switch (Op) {
2244 case ISD::LOAD:
2245 case ISD::STORE:
2246 return true;
2247 default:
2248 return false;
2249 }
2250 }
2251
2252 // SimplifySetCC uses this function to determine whether or not it should
2253 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2254 if (VT == MVT::i1 && Op == ISD::SETCC)
2255 return false;
2256
2257 return TargetLowering::isTypeDesirableForOp(Op, VT);
2258}
2259
2262 // This isn't really a constant pool but close enough.
2265 return PtrInfo;
2266}
2267
2268SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2269 const SDLoc &SL,
2270 SDValue Chain,
2271 uint64_t Offset) const {
2272 const DataLayout &DL = DAG.getDataLayout();
2273 MachineFunction &MF = DAG.getMachineFunction();
2274 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2275 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
2276
2277 auto [InputPtrReg, RC, ArgTy] =
2278 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2279
2280 // We may not have the kernarg segment argument if we have no kernel
2281 // arguments.
2282 if (!InputPtrReg)
2283 return DAG.getConstant(Offset, SL, PtrVT);
2284
2285 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2286 SDValue BasePtr = DAG.getCopyFromReg(
2287 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2288
2289 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2290}
2291
2292SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2293 const SDLoc &SL) const {
2294 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
2295 FIRST_IMPLICIT);
2296 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2297}
2298
2299SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2300 const SDLoc &SL) const {
2301
2302 Function &F = DAG.getMachineFunction().getFunction();
2303 std::optional<uint32_t> KnownSize =
2304 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2305 if (KnownSize.has_value())
2306 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2307 return SDValue();
2308}
2309
2310SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2311 const SDLoc &SL, SDValue Val,
2312 bool Signed,
2313 const ISD::InputArg *Arg) const {
2314 // First, if it is a widened vector, narrow it.
2315 if (VT.isVector() &&
2317 EVT NarrowedVT =
2320 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2321 DAG.getConstant(0, SL, MVT::i32));
2322 }
2323
2324 // Then convert the vector elements or scalar value.
2325 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2326 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2327 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2328 }
2329
2330 if (MemVT.isFloatingPoint()) {
2331 if (VT.isFloatingPoint()) {
2332 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2333 } else {
2334 assert(!MemVT.isVector());
2335 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2336 SDValue Cast = DAG.getBitcast(IntVT, Val);
2337 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2338 }
2339 } else if (Signed)
2340 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2341 else
2342 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2343
2344 return Val;
2345}
2346
2347SDValue SITargetLowering::lowerKernargMemParameter(
2348 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2349 uint64_t Offset, Align Alignment, bool Signed,
2350 const ISD::InputArg *Arg) const {
2351
2352 MachinePointerInfo PtrInfo =
2354
2355 // Try to avoid using an extload by loading earlier than the argument address,
2356 // and extracting the relevant bits. The load should hopefully be merged with
2357 // the previous argument.
2358 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2359 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2360 int64_t AlignDownOffset = alignDown(Offset, 4);
2361 int64_t OffsetDiff = Offset - AlignDownOffset;
2362
2363 EVT IntVT = MemVT.changeTypeToInteger();
2364
2365 // TODO: If we passed in the base kernel offset we could have a better
2366 // alignment than 4, but we don't really need it.
2367 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2368 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2369 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2370 MachineMemOperand::MODereferenceable |
2371 MachineMemOperand::MOInvariant);
2372
2373 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2374 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2375
2376 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2377 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2378 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2379
2380 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2381 }
2382
2383 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2384 SDValue Load = DAG.getLoad(
2385 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2386 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2387
2388 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2389 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2390}
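// [Editor's illustration, not part of the LLVM source] Worked example of the
// sub-dword path above: an i16 kernel argument at byte offset 6 with 2-byte
// alignment is loaded as the aligned dword at offset 4, shifted right by
// (6 - 4) * 8 = 16 bits, truncated to i16 and converted to the expected
// argument type, so no scalar extload is needed.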
2391
2392/// Coerce an argument which was passed in a different ABI type to the original
2393/// expected value type.
2394SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2395 SDValue Val,
2396 CCValAssign &VA,
2397 const SDLoc &SL) const {
2398 EVT ValVT = VA.getValVT();
2399
2400 // If this is an 8 or 16-bit value, it is really passed promoted
2401 // to 32 bits. Insert an assert[sz]ext to capture this, then
2402 // truncate to the right size.
2403 switch (VA.getLocInfo()) {
2404 case CCValAssign::Full:
2405 return Val;
2406 case CCValAssign::BCvt:
2407 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2408 case CCValAssign::SExt:
2409 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2410 DAG.getValueType(ValVT));
2411 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2412 case CCValAssign::ZExt:
2413 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2414 DAG.getValueType(ValVT));
2415 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2416 case CCValAssign::AExt:
2417 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2418 default:
2419 llvm_unreachable("Unknown loc info!");
2420 }
2421}
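// [Editor's illustration, not part of the LLVM source] For example, an i8
// argument that was sign-extended into a 32-bit location arrives here with
// LocInfo SExt: an AssertSext node records that only the low 8 bits carry
// the value, and the following TRUNCATE recovers the original i8 type.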
2422
2423SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2424 CCValAssign &VA, const SDLoc &SL,
2425 SDValue Chain,
2426 const ISD::InputArg &Arg) const {
2427 MachineFunction &MF = DAG.getMachineFunction();
2428 MachineFrameInfo &MFI = MF.getFrameInfo();
2429
2430 if (Arg.Flags.isByVal()) {
2431 unsigned Size = Arg.Flags.getByValSize();
2432 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2433 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2434 }
2435
2436 unsigned ArgOffset = VA.getLocMemOffset();
2437 unsigned ArgSize = VA.getValVT().getStoreSize();
2438
2439 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2440
2441 // Create load nodes to retrieve arguments from the stack.
2442 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2443
2444 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2445 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2446 MVT MemVT = VA.getValVT();
2447
2448 switch (VA.getLocInfo()) {
2449 default:
2450 break;
2451 case CCValAssign::BCvt:
2452 MemVT = VA.getLocVT();
2453 break;
2454 case CCValAssign::SExt:
2455 ExtType = ISD::SEXTLOAD;
2456 break;
2457 case CCValAssign::ZExt:
2458 ExtType = ISD::ZEXTLOAD;
2459 break;
2460 case CCValAssign::AExt:
2461 ExtType = ISD::EXTLOAD;
2462 break;
2463 }
2464
2465 SDValue ArgValue = DAG.getExtLoad(
2466 ExtType, SL, VA.getLocVT(), Chain, FIN,
2467 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2468
2469 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2470 if (ConvertedVal == ArgValue)
2471 return ConvertedVal;
2472
2473 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2474}
2475
2476SDValue SITargetLowering::lowerWorkGroupId(
2477 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2478 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2479 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2480 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2481 if (!Subtarget->hasClusters())
2482 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2483
2484 // Clusters are supported. Return the global position in the grid. If clusters
2485 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2486
2487 // WorkGroupIdXYZ = ClusterId == 0 ?
2488 // ClusterIdXYZ :
2489 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2490 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2491 SDLoc SL(ClusterIdXYZ);
2492 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2493 SDValue One = DAG.getConstant(1, SL, VT);
2494 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2495 SDValue ClusterWorkGroupIdXYZ =
2496 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2497 SDValue GlobalIdXYZ =
2498 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2499 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2500
2501 switch (MFI.getClusterDims().getKind()) {
2504 return GlobalIdXYZ;
2506 return ClusterIdXYZ;
2508 using namespace AMDGPU::Hwreg;
2509 SDValue ClusterIdField =
2510 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2511 SDNode *GetReg =
2512 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2513 SDValue ClusterId(GetReg, 0);
2514 SDValue Zero = DAG.getConstant(0, SL, VT);
2515 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2516 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2517 }
2518 }
2519
2520 llvm_unreachable("nothing should reach here");
2521}
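// [Editor's illustration, not part of the LLVM source] Worked example of the
// formula above with hypothetical values: if ClusterMaxIdX is 3 (four
// workgroups per cluster in X), ClusterIdX is 5 and ClusterWorkGroupIdX is
// 2, the global workgroup id is 5 * (3 + 1) + 2 = 22. When the cluster id
// read from the hardware register is 0 (no clusters), the raw ClusterIdXYZ
// value already is the workgroup id and is selected unchanged.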
2522
2523SDValue SITargetLowering::getPreloadedValue(
2524 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2525 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2526 const ArgDescriptor *Reg = nullptr;
2527 const TargetRegisterClass *RC;
2528 LLT Ty;
2529
2531 const ArgDescriptor WorkGroupIDX =
2532 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2533 // If GridZ is not programmed in an entry function then the hardware will set
2534 // it to all zeros, so there is no need to mask the GridY value in the low
2535 // order bits.
2536 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2537 AMDGPU::TTMP7,
2538 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2539 const ArgDescriptor WorkGroupIDZ =
2540 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2541 const ArgDescriptor ClusterWorkGroupIDX =
2542 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2543 const ArgDescriptor ClusterWorkGroupIDY =
2544 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2545 const ArgDescriptor ClusterWorkGroupIDZ =
2546 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2547 const ArgDescriptor ClusterWorkGroupMaxIDX =
2548 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2549 const ArgDescriptor ClusterWorkGroupMaxIDY =
2550 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2551 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2552 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2553 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2554 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2555
2556 auto LoadConstant = [&](unsigned N) {
2557 return DAG.getConstant(N, SDLoc(), VT);
2558 };
2559
2560 if (Subtarget->hasArchitectedSGPRs() &&
2562 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2563 bool HasFixedDims = ClusterDims.isFixedDims();
2564
2565 switch (PVID) {
2567 Reg = &WorkGroupIDX;
2568 RC = &AMDGPU::SReg_32RegClass;
2569 Ty = LLT::scalar(32);
2570 break;
2572 Reg = &WorkGroupIDY;
2573 RC = &AMDGPU::SReg_32RegClass;
2574 Ty = LLT::scalar(32);
2575 break;
2577 Reg = &WorkGroupIDZ;
2578 RC = &AMDGPU::SReg_32RegClass;
2579 Ty = LLT::scalar(32);
2580 break;
2582 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2583 return LoadConstant(0);
2584 Reg = &ClusterWorkGroupIDX;
2585 RC = &AMDGPU::SReg_32RegClass;
2586 Ty = LLT::scalar(32);
2587 break;
2589 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2590 return LoadConstant(0);
2591 Reg = &ClusterWorkGroupIDY;
2592 RC = &AMDGPU::SReg_32RegClass;
2593 Ty = LLT::scalar(32);
2594 break;
2596 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2597 return LoadConstant(0);
2598 Reg = &ClusterWorkGroupIDZ;
2599 RC = &AMDGPU::SReg_32RegClass;
2600 Ty = LLT::scalar(32);
2601 break;
2603 if (HasFixedDims)
2604 return LoadConstant(ClusterDims.getDims()[0] - 1);
2605 Reg = &ClusterWorkGroupMaxIDX;
2606 RC = &AMDGPU::SReg_32RegClass;
2607 Ty = LLT::scalar(32);
2608 break;
2610 if (HasFixedDims)
2611 return LoadConstant(ClusterDims.getDims()[1] - 1);
2612 Reg = &ClusterWorkGroupMaxIDY;
2613 RC = &AMDGPU::SReg_32RegClass;
2614 Ty = LLT::scalar(32);
2615 break;
2617 if (HasFixedDims)
2618 return LoadConstant(ClusterDims.getDims()[2] - 1);
2619 Reg = &ClusterWorkGroupMaxIDZ;
2620 RC = &AMDGPU::SReg_32RegClass;
2621 Ty = LLT::scalar(32);
2622 break;
2624 Reg = &ClusterWorkGroupMaxFlatID;
2625 RC = &AMDGPU::SReg_32RegClass;
2626 Ty = LLT::scalar(32);
2627 break;
2628 default:
2629 break;
2630 }
2631 }
2632
2633 if (!Reg)
2634 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2635 if (!Reg) {
2636 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2637 // It's possible for a kernarg intrinsic call to appear in a kernel with
2638 // no allocated segment, in which case we do not add the user sgpr
2639 // argument, so just return null.
2640 return DAG.getConstant(0, SDLoc(), VT);
2641 }
2642
2643 // It's undefined behavior if a function marked with the amdgpu-no-*
2644 // attributes uses the corresponding intrinsic.
2645 return DAG.getPOISON(VT);
2646 }
2647
2648 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2649}
2650
2651 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2652 CallingConv::ID CallConv,
2653 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2654 FunctionType *FType,
2655 SIMachineFunctionInfo *Info) {
2656 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2657 const ISD::InputArg *Arg = &Ins[I];
2658
2659 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2660 "vector type argument should have been split");
2661
2662 // First check if it's a PS input addr.
2663 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2664 PSInputNum <= 15) {
2665 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2666
2667 // Inconveniently only the first part of the split is marked as isSplit,
2668 // so skip to the end. We only want to increment PSInputNum once for the
2669 // entire split argument.
2670 if (Arg->Flags.isSplit()) {
2671 while (!Arg->Flags.isSplitEnd()) {
2672 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2673 "unexpected vector split in ps argument type");
2674 if (!SkipArg)
2675 Splits.push_back(*Arg);
2676 Arg = &Ins[++I];
2677 }
2678 }
2679
2680 if (SkipArg) {
2681 // We can safely skip PS inputs.
2682 Skipped.set(Arg->getOrigArgIndex());
2683 ++PSInputNum;
2684 continue;
2685 }
2686
2687 Info->markPSInputAllocated(PSInputNum);
2688 if (Arg->Used)
2689 Info->markPSInputEnabled(PSInputNum);
2690
2691 ++PSInputNum;
2692 }
2693
2694 Splits.push_back(*Arg);
2695 }
2696}
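// [Editor's illustration, not part of the LLVM source] For a pixel shader
// input that is never read and whose PSInputNum bit is not already
// allocated, the loop above records the original argument index in Skipped
// instead of pushing it into Splits, so LowerFormalArguments later
// substitutes poison for it rather than reserving input VGPRs.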
2697
2698// Allocate special inputs passed in VGPRs.
2699 void SITargetLowering::allocateSpecialEntryInputVGPRs(
2700 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2701 SIMachineFunctionInfo &Info) const {
2702 const LLT S32 = LLT::scalar(32);
2704
2705 if (Info.hasWorkItemIDX()) {
2706 Register Reg = AMDGPU::VGPR0;
2707 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2708
2709 CCInfo.AllocateReg(Reg);
2710 unsigned Mask =
2711 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2712 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2713 }
2714
2715 if (Info.hasWorkItemIDY()) {
2716 assert(Info.hasWorkItemIDX());
2717 if (Subtarget->hasPackedTID()) {
2718 Info.setWorkItemIDY(
2719 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2720 } else {
2721 unsigned Reg = AMDGPU::VGPR1;
2722 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2723
2724 CCInfo.AllocateReg(Reg);
2725 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2726 }
2727 }
2728
2729 if (Info.hasWorkItemIDZ()) {
2730 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2731 if (Subtarget->hasPackedTID()) {
2732 Info.setWorkItemIDZ(
2733 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2734 } else {
2735 unsigned Reg = AMDGPU::VGPR2;
2736 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2737
2738 CCInfo.AllocateReg(Reg);
2739 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2740 }
2741 }
2742}
2743
2744// Try to allocate a VGPR at the end of the argument list, or if no argument
2745// VGPRs are left allocating a stack slot.
2746 // If \p Mask is given, it indicates the bitfield position in the register.
2747 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2748static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2749 ArgDescriptor Arg = ArgDescriptor()) {
2750 if (Arg.isSet())
2751 return ArgDescriptor::createArg(Arg, Mask);
2752
2753 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2754 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2755 if (RegIdx == ArgVGPRs.size()) {
2756 // Spill to stack required.
2757 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2758
2759 return ArgDescriptor::createStack(Offset, Mask);
2760 }
2761
2762 unsigned Reg = ArgVGPRs[RegIdx];
2763 Reg = CCInfo.AllocateReg(Reg);
2764 assert(Reg != AMDGPU::NoRegister);
2765
2766 MachineFunction &MF = CCInfo.getMachineFunction();
2767 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2768 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2769 return ArgDescriptor::createRegister(Reg, Mask);
2770}
2771
2772 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2773 const TargetRegisterClass *RC,
2774 unsigned NumArgRegs) {
2775 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2776 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2777 if (RegIdx == ArgSGPRs.size())
2778 report_fatal_error("ran out of SGPRs for arguments");
2779
2780 unsigned Reg = ArgSGPRs[RegIdx];
2781 Reg = CCInfo.AllocateReg(Reg);
2782 assert(Reg != AMDGPU::NoRegister);
2783
2784 MachineFunction &MF = CCInfo.getMachineFunction();
2785 MF.addLiveIn(Reg, RC);
2786 return ArgDescriptor::createRegister(Reg);
2787}
2788
2789// If this has a fixed position, we still should allocate the register in the
2790// CCInfo state. Technically we could get away with this for values passed
2791// outside of the normal argument range.
2792 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2793 const TargetRegisterClass *RC,
2794 MCRegister Reg) {
2795 Reg = CCInfo.AllocateReg(Reg);
2796 assert(Reg != AMDGPU::NoRegister);
2797 MachineFunction &MF = CCInfo.getMachineFunction();
2798 MF.addLiveIn(Reg, RC);
2799}
2800
2801static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2802 if (Arg) {
2803 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2804 Arg.getRegister());
2805 } else
2806 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2807}
2808
2809static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2810 if (Arg) {
2811 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2812 Arg.getRegister());
2813 } else
2814 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2815}
2816
2817/// Allocate implicit function VGPR arguments at the end of allocated user
2818/// arguments.
2819 void SITargetLowering::allocateSpecialInputVGPRs(
2820 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2821 SIMachineFunctionInfo &Info) const {
2822 const unsigned Mask = 0x3ff;
2823 ArgDescriptor Arg;
2824
2825 if (Info.hasWorkItemIDX()) {
2826 Arg = allocateVGPR32Input(CCInfo, Mask);
2827 Info.setWorkItemIDX(Arg);
2828 }
2829
2830 if (Info.hasWorkItemIDY()) {
2831 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2832 Info.setWorkItemIDY(Arg);
2833 }
2834
2835 if (Info.hasWorkItemIDZ())
2836 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2837}
2838
2839/// Allocate implicit function VGPR arguments in fixed registers.
2840 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2841 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2842 SIMachineFunctionInfo &Info) const {
2843 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2844 if (!Reg)
2845 report_fatal_error("failed to allocate VGPR for implicit arguments");
2846
2847 const unsigned Mask = 0x3ff;
2848 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2849 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2850 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2851}
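// [Editor's illustration, not part of the LLVM source] The three masks above
// pack all work-item IDs into one VGPR: X occupies bits [9:0], Y bits
// [19:10] and Z bits [29:20].
static_assert((0x3ffu << 10) == 0x000ffc00u && (0x3ffu << 20) == 0x3ff00000u,
              "packed work-item ID fields are 10 bits wide each");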
2852
2853 void SITargetLowering::allocateSpecialInputSGPRs(
2854 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2855 SIMachineFunctionInfo &Info) const {
2856 auto &ArgInfo = Info.getArgInfo();
2857 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2858
2859 // TODO: Unify handling with private memory pointers.
2860 if (UserSGPRInfo.hasDispatchPtr())
2861 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2862
2863 if (UserSGPRInfo.hasQueuePtr())
2864 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2865
2866 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2867 // constant offset from the kernarg segment.
2868 if (Info.hasImplicitArgPtr())
2869 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2870
2871 if (UserSGPRInfo.hasDispatchID())
2872 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2873
2874 // flat_scratch_init is not applicable for non-kernel functions.
2875
2876 if (Info.hasWorkGroupIDX())
2877 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2878
2879 if (Info.hasWorkGroupIDY())
2880 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2881
2882 if (Info.hasWorkGroupIDZ())
2883 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2884
2885 if (Info.hasLDSKernelId())
2886 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2887}
2888
2889// Allocate special inputs passed in user SGPRs.
2890 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2891 MachineFunction &MF,
2892 const SIRegisterInfo &TRI,
2893 SIMachineFunctionInfo &Info) const {
2894 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2895 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2896 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2897 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2898 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2899 }
2900
2901 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2902 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2903 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2904 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2905 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2906 }
2907
2908 if (UserSGPRInfo.hasDispatchPtr()) {
2909 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2910 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2911 CCInfo.AllocateReg(DispatchPtrReg);
2912 }
2913
2914 if (UserSGPRInfo.hasQueuePtr()) {
2915 Register QueuePtrReg = Info.addQueuePtr(TRI);
2916 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2917 CCInfo.AllocateReg(QueuePtrReg);
2918 }
2919
2920 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2921 MachineRegisterInfo &MRI = MF.getRegInfo();
2922 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2923 CCInfo.AllocateReg(InputPtrReg);
2924
2925 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2926 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2927 }
2928
2929 if (UserSGPRInfo.hasDispatchID()) {
2930 Register DispatchIDReg = Info.addDispatchID(TRI);
2931 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2932 CCInfo.AllocateReg(DispatchIDReg);
2933 }
2934
2935 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2936 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2937 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2938 CCInfo.AllocateReg(FlatScratchInitReg);
2939 }
2940
2941 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2942 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2943 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2944 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2945 }
2946
2947 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2948 // these from the dispatch pointer.
2949}
2950
2951 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2952// sequential starting from the first argument.
2953 void SITargetLowering::allocatePreloadKernArgSGPRs(
2954 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2955 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2956 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2957 Function &F = MF.getFunction();
2958 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2959 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2960 bool InPreloadSequence = true;
2961 unsigned InIdx = 0;
2962 bool AlignedForImplictArgs = false;
2963 unsigned ImplicitArgOffset = 0;
2964 for (auto &Arg : F.args()) {
2965 if (!InPreloadSequence || !Arg.hasInRegAttr())
2966 break;
2967
2968 unsigned ArgIdx = Arg.getArgNo();
2969 // Don't preload non-original args or parts not in the current preload
2970 // sequence.
2971 if (InIdx < Ins.size() &&
2972 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2973 break;
2974
2975 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2976 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2977 InIdx++) {
2978 assert(ArgLocs[ArgIdx].isMemLoc());
2979 auto &ArgLoc = ArgLocs[InIdx];
2980 const Align KernelArgBaseAlign = Align(16);
2981 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2982 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2983 unsigned NumAllocSGPRs =
2984 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2985
2986 // Fix alignment for hidden arguments.
2987 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2988 if (!AlignedForImplictArgs) {
2989 ImplicitArgOffset =
2990 alignTo(LastExplicitArgOffset,
2991 Subtarget->getAlignmentForImplicitArgPtr()) -
2992 LastExplicitArgOffset;
2993 AlignedForImplictArgs = true;
2994 }
2995 ArgOffset += ImplicitArgOffset;
2996 }
2997
2998 // Arg is preloaded into the previous SGPR.
2999 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3000 assert(InIdx >= 1 && "No previous SGPR");
3001 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3002 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3003 continue;
3004 }
3005
3006 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3007 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3008 // Check for free user SGPRs for preloading.
3009 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3010 InPreloadSequence = false;
3011 break;
3012 }
3013
3014 // Preload this argument.
3015 const TargetRegisterClass *RC =
3016 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3017 SmallVectorImpl<MCRegister> *PreloadRegs =
3018 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3019
3020 if (PreloadRegs->size() > 1)
3021 RC = &AMDGPU::SGPR_32RegClass;
3022 for (auto &Reg : *PreloadRegs) {
3023 assert(Reg);
3024 MF.addLiveIn(Reg, RC);
3025 CCInfo.AllocateReg(Reg);
3026 }
3027
3028 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3029 }
3030 }
3031}
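// [Editor's illustration, not part of the LLVM source] Worked example with
// hypothetical arguments: an i64 at kernarg offset 8 needs
// NumAllocSGPRs = 64 / 32 = 2 SGPRs and no padding SGPRs if the previous
// argument ended at offset 8. Two consecutive i16 arguments at offsets 16
// and 18 share one SGPR: the first is dword-aligned and gets its own
// register, while the second is sub-dword and under-aligned, so it takes the
// "packed into the previous SGPR" path above and reuses that register.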
3032
3033 void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3034 const SIRegisterInfo &TRI,
3035 SIMachineFunctionInfo &Info) const {
3036 // Always allocate this last since it is a synthetic preload.
3037 if (Info.hasLDSKernelId()) {
3038 Register Reg = Info.addLDSKernelId();
3039 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3040 CCInfo.AllocateReg(Reg);
3041 }
3042}
3043
3044// Allocate special input registers that are initialized per-wave.
3045 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
3046 SIMachineFunctionInfo &Info,
3047 CallingConv::ID CallConv,
3048 bool IsShader) const {
3049 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3050 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3051 // Note: user SGPRs are handled by the front-end for graphics shaders
3052 // Pad up the used user SGPRs with dead inputs.
3053
3054 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3055 // before enabling architected SGPRs for workgroup IDs.
3056 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3057
3058 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3059 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3060 // rely on it to reach 16 since if we end up having no stack usage, it will
3061 // not really be added.
3062 unsigned NumRequiredSystemSGPRs =
3063 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3064 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3065 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3066 Register Reg = Info.addReservedUserSGPR();
3067 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3068 CCInfo.AllocateReg(Reg);
3069 }
3070 }
3071
3072 if (!HasArchitectedSGPRs) {
3073 if (Info.hasWorkGroupIDX()) {
3074 Register Reg = Info.addWorkGroupIDX();
3075 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3076 CCInfo.AllocateReg(Reg);
3077 }
3078
3079 if (Info.hasWorkGroupIDY()) {
3080 Register Reg = Info.addWorkGroupIDY();
3081 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3082 CCInfo.AllocateReg(Reg);
3083 }
3084
3085 if (Info.hasWorkGroupIDZ()) {
3086 Register Reg = Info.addWorkGroupIDZ();
3087 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3088 CCInfo.AllocateReg(Reg);
3089 }
3090 }
3091
3092 if (Info.hasWorkGroupInfo()) {
3093 Register Reg = Info.addWorkGroupInfo();
3094 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3095 CCInfo.AllocateReg(Reg);
3096 }
3097
3098 if (Info.hasPrivateSegmentWaveByteOffset()) {
3099 // Scratch wave offset passed in system SGPR.
3100 unsigned PrivateSegmentWaveByteOffsetReg;
3101
3102 if (IsShader) {
3103 PrivateSegmentWaveByteOffsetReg =
3104 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3105
3106 // This is true if the scratch wave byte offset doesn't have a fixed
3107 // location.
3108 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3109 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3110 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3111 }
3112 } else
3113 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3114
3115 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3116 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3117 }
3118
3119 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3120 Info.getNumPreloadedSGPRs() >= 16);
3121}
3122
3123 static void reservePrivateMemoryRegs(const TargetMachine &TM,
3124 MachineFunction &MF,
3125 const SIRegisterInfo &TRI,
3126 SIMachineFunctionInfo &Info) {
3127 // Now that we've figured out where the scratch register inputs are, see if
3128 // we should reserve the arguments and use them directly.
3129 MachineFrameInfo &MFI = MF.getFrameInfo();
3130 bool HasStackObjects = MFI.hasStackObjects();
3131 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3132
3133 // Record that we know we have non-spill stack objects so we don't need to
3134 // check all stack objects later.
3135 if (HasStackObjects)
3136 Info.setHasNonSpillStackObjects(true);
3137
3138 // Everything live out of a block is spilled with fast regalloc, so it's
3139 // almost certain that spilling will be required.
3140 if (TM.getOptLevel() == CodeGenOptLevel::None)
3141 HasStackObjects = true;
3142
3143 // For now assume stack access is needed in any callee functions, so we need
3144 // the scratch registers to pass in.
3145 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3146
3147 if (!ST.hasFlatScratchEnabled()) {
3148 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3149 // If we have stack objects, we unquestionably need the private buffer
3150 // resource. For the Code Object V2 ABI, this will be the first 4 user
3151 // SGPR inputs. We can reserve those and use them directly.
3152
3153 Register PrivateSegmentBufferReg =
3154 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3155 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3156 } else {
3157 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3158 // We tentatively reserve the last registers (skipping the last registers
3159 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3160 // we'll replace these with the ones immediately after those which were
3161 // really allocated. In the prologue copies will be inserted from the
3162 // argument to these reserved registers.
3163
3164 // Without HSA, relocations are used for the scratch pointer and the
3165 // buffer resource setup is always inserted in the prologue. Scratch wave
3166 // offset is still in an input SGPR.
3167 Info.setScratchRSrcReg(ReservedBufferReg);
3168 }
3169 }
3170
3172
3173 // For entry functions we have to set up the stack pointer if we use it,
3174 // whereas non-entry functions get this "for free". This means there is no
3175 // intrinsic advantage to using S32 over S34 in cases where we do not have
3176 // calls but do need a frame pointer (i.e. if we are requested to have one
3177 // because frame pointer elimination is disabled). To keep things simple we
3178 // only ever use S32 as the call ABI stack pointer, and so using it does not
3179 // imply we need a separate frame pointer.
3180 //
3181 // Try to use s32 as the SP, but move it if it would interfere with input
3182 // arguments. This won't work with calls though.
3183 //
3184 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3185 // registers.
3186 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3187 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3188 } else {
3190
3191 if (MFI.hasCalls())
3192 report_fatal_error("call in graphics shader with too many input SGPRs");
3193
3194 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3195 if (!MRI.isLiveIn(Reg)) {
3196 Info.setStackPtrOffsetReg(Reg);
3197 break;
3198 }
3199 }
3200
3201 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3202 report_fatal_error("failed to find register for SP");
3203 }
3204
3205 // hasFP should be accurate for entry functions even before the frame is
3206 // finalized, because it does not rely on the known stack size, only
3207 // properties like whether variable sized objects are present.
3208 if (ST.getFrameLowering()->hasFP(MF)) {
3209 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3210 }
3211}
3212
3215 return !Info->isEntryFunction();
3216}
3217
3219
3221 MachineBasicBlock *Entry,
3222 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3224
3225 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3226 if (!IStart)
3227 return;
3228
3229 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3230 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3231 MachineBasicBlock::iterator MBBI = Entry->begin();
3232 for (const MCPhysReg *I = IStart; *I; ++I) {
3233 const TargetRegisterClass *RC = nullptr;
3234 if (AMDGPU::SReg_64RegClass.contains(*I))
3235 RC = &AMDGPU::SGPR_64RegClass;
3236 else if (AMDGPU::SReg_32RegClass.contains(*I))
3237 RC = &AMDGPU::SGPR_32RegClass;
3238 else
3239 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3240
3241 Register NewVR = MRI->createVirtualRegister(RC);
3242 // Create copy from CSR to a virtual register.
3243 Entry->addLiveIn(*I);
3244 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3245 .addReg(*I);
3246
3247 // Insert the copy-back instructions right before the terminator.
3248 for (auto *Exit : Exits)
3249 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3250 TII->get(TargetOpcode::COPY), *I)
3251 .addReg(NewVR);
3252 }
3253}
3254
3255 SDValue SITargetLowering::LowerFormalArguments(
3256 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3257 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3258 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3260
3262 const Function &Fn = MF.getFunction();
3265 bool IsError = false;
3266
3267 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3269 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3270 IsError = true;
3271 }
3272
3273 SmallVector<ISD::InputArg, 16> Splits;
3274 SmallVector<CCValAssign, 16> ArgLocs;
3275 BitVector Skipped(Ins.size());
3276 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3277 *DAG.getContext());
3278
3279 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3280 bool IsKernel = AMDGPU::isKernel(CallConv);
3281 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3282
3283 if (IsGraphics) {
3284 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3285 assert(!UserSGPRInfo.hasDispatchPtr() &&
3286 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3287 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3288 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3289 (void)UserSGPRInfo;
3290 if (!Subtarget->hasFlatScratchEnabled())
3291 assert(!UserSGPRInfo.hasFlatScratchInit());
3292 if ((CallConv != CallingConv::AMDGPU_CS &&
3293 CallConv != CallingConv::AMDGPU_Gfx &&
3294 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3295 !Subtarget->hasArchitectedSGPRs())
3296 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3297 !Info->hasWorkGroupIDZ());
3298 }
3299
3300 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3301
3302 if (CallConv == CallingConv::AMDGPU_PS) {
3303 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3304
3305 // At least one interpolation mode must be enabled or else the GPU will
3306 // hang.
3307 //
3308 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3309 // set PSInputAddr, the user wants to enable some bits after the compilation
3310 // based on run-time states. Since we can't know what the final PSInputEna
3311 // will look like, so we shouldn't do anything here and the user should take
3312 // responsibility for the correct programming.
3313 //
3314 // Otherwise, the following restrictions apply:
3315 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3316 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3317 // enabled too.
3318 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3319 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3320 CCInfo.AllocateReg(AMDGPU::VGPR0);
3321 CCInfo.AllocateReg(AMDGPU::VGPR1);
3322 Info->markPSInputAllocated(0);
3323 Info->markPSInputEnabled(0);
3324 }
3325 if (Subtarget->isAmdPalOS()) {
3326 // For isAmdPalOS, the user does not enable some bits after compilation
3327 // based on run-time states; the register values being generated here are
3328 // the final ones set in hardware. Therefore we need to apply the
3329 // workaround to PSInputAddr and PSInputEnable together. (The case where
3330 // a bit is set in PSInputAddr but not PSInputEnable is where the
3331 // frontend set up an input arg for a particular interpolation mode, but
3332 // nothing uses that input arg. Really we should have an earlier pass
3333 // that removes such an arg.)
3334 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3335 if ((PsInputBits & 0x7F) == 0 ||
3336 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3337 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3338 }
3339 } else if (IsKernel) {
3340 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3341 } else {
3342 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3343 Ins.end());
3344 }
3345
3346 if (IsKernel)
3347 analyzeFormalArgumentsCompute(CCInfo, Ins);
3348
3349 if (IsEntryFunc) {
3350 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3351 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3352 if (IsKernel && Subtarget->hasKernargPreload())
3353 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3354
3355 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3356 } else if (!IsGraphics) {
3357 // For the fixed ABI, pass workitem IDs in the last argument register.
3358 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3359
3360 // FIXME: Sink this into allocateSpecialInputSGPRs
3361 if (!Subtarget->hasFlatScratchEnabled())
3362 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3363
3364 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3365 }
3366
3367 if (!IsKernel) {
3368 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3369 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3370
3371 // This assumes the registers are allocated by CCInfo in ascending order
3372 // with no gaps.
3373 Info->setNumWaveDispatchSGPRs(
3374 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3375 Info->setNumWaveDispatchVGPRs(
3376 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3377 } else if (Info->getNumKernargPreloadedSGPRs()) {
3378 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3379 }
3380
3382
3383 if (IsWholeWaveFunc) {
3384 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3385 {MVT::i1, MVT::Other}, Chain);
3386 InVals.push_back(Setup.getValue(0));
3387 Chains.push_back(Setup.getValue(1));
3388 }
3389
3390 // FIXME: This is the minimum kernel argument alignment. We should improve
3391 // this to the maximum alignment of the arguments.
3392 //
3393 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3394 // kern arg offset.
3395 const Align KernelArgBaseAlign = Align(16);
3396
3397 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3398 ++i) {
3399 const ISD::InputArg &Arg = Ins[i];
3400 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3401 InVals.push_back(DAG.getPOISON(Arg.VT));
3402 continue;
3403 }
3404
3405 CCValAssign &VA = ArgLocs[ArgIdx++];
3406 MVT VT = VA.getLocVT();
3407
3408 if (IsEntryFunc && VA.isMemLoc()) {
3409 VT = Ins[i].VT;
3410 EVT MemVT = VA.getLocVT();
3411
3412 const uint64_t Offset = VA.getLocMemOffset();
3413 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3414
3415 if (Arg.Flags.isByRef()) {
3416 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3417
3418 const GCNTargetMachine &TM =
3419 static_cast<const GCNTargetMachine &>(getTargetMachine());
3420 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3421 Arg.Flags.getPointerAddrSpace())) {
3424 }
3425
3426 InVals.push_back(Ptr);
3427 continue;
3428 }
3429
3430 SDValue NewArg;
3431 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3432 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3433 // In this case the argument is packed into the previous preload SGPR.
3434 int64_t AlignDownOffset = alignDown(Offset, 4);
3435 int64_t OffsetDiff = Offset - AlignDownOffset;
3436 EVT IntVT = MemVT.changeTypeToInteger();
3437
3438 const SIMachineFunctionInfo *Info =
3441 Register Reg =
3442 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3443
3444 assert(Reg);
3445 Register VReg = MRI.getLiveInVirtReg(Reg);
3446 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3447
3448 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3449 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3450
3451 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3452 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3453 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3454 Ins[i].Flags.isSExt(), &Ins[i]);
3455
3456 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3457 } else {
3458 const SIMachineFunctionInfo *Info =
3461 const SmallVectorImpl<MCRegister> &PreloadRegs =
3462 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3463
3464 SDValue Copy;
3465 if (PreloadRegs.size() == 1) {
3466 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3467 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3468 NewArg = DAG.getCopyFromReg(
3469 Chain, DL, VReg,
3471 TRI->getRegSizeInBits(*RC)));
3472
3473 } else {
3474 // If the kernarg alignment does not match the alignment of the SGPR
3475 // tuple RC that can accommodate this argument, it will be built up
3476 // via copies from the individual SGPRs that the argument was
3477 // preloaded to.
3479 for (auto Reg : PreloadRegs) {
3480 Register VReg = MRI.getLiveInVirtReg(Reg);
3481 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3482 Elts.push_back(Copy);
3483 }
3484 NewArg =
3485 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3486 PreloadRegs.size()),
3487 DL, Elts);
3488 }
3489
3490 // If the argument was preloaded to multiple consecutive 32-bit
3491 // registers because of misalignment between addressable SGPR tuples
3492 // and the argument size, we can still assume, because of kernarg
3493 // segment alignment restrictions, that NewArg's size is the same as
3494 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3495 // truncate since we cannot preload to less than a single SGPR and the
3496 // MemVT may be smaller.
3497 EVT MemVTInt =
3499 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3500 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3501
3502 NewArg = DAG.getBitcast(MemVT, NewArg);
3503 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3504 Ins[i].Flags.isSExt(), &Ins[i]);
3505 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3506 }
3507 } else {
3508 // Hidden arguments that are in the kernel signature must be preloaded
3509 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3510 // the argument list and is not preloaded.
3511 if (Arg.isOrigArg()) {
3512 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3513 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3515 *OrigArg->getParent(),
3516 "hidden argument in kernel signature was not preloaded",
3517 DL.getDebugLoc()));
3518 }
3519 }
3520
3521 NewArg =
3522 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3523 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3524 }
3525 Chains.push_back(NewArg.getValue(1));
3526
3527 auto *ParamTy =
3528 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3529 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3530 ParamTy &&
3531 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3532 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3533 // On SI local pointers are just offsets into LDS, so they are always
3534 // less than 16-bits. On CI and newer they could potentially be
3535 // real pointers, so we can't guarantee their size.
3536 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3537 DAG.getValueType(MVT::i16));
3538 }
3539
3540 InVals.push_back(NewArg);
3541 continue;
3542 }
3543 if (!IsEntryFunc && VA.isMemLoc()) {
3544 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3545 InVals.push_back(Val);
3546 if (!Arg.Flags.isByVal())
3547 Chains.push_back(Val.getValue(1));
3548 continue;
3549 }
3550
3551 assert(VA.isRegLoc() && "Parameter must be in a register!");
3552
3553 Register Reg = VA.getLocReg();
3554 const TargetRegisterClass *RC = nullptr;
3555 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3556 RC = &AMDGPU::VGPR_32RegClass;
3557 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3558 RC = &AMDGPU::SGPR_32RegClass;
3559 else
3560 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3561
3562 Reg = MF.addLiveIn(Reg, RC);
3563 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3564
3565 if (Arg.Flags.isSRet()) {
3566 // The return object should be reasonably addressable.
3567
3568 // FIXME: This helps when the return is a real sret. If it is an
3569 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3570 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3571 unsigned NumBits =
3572 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3573 Val = DAG.getNode(
3574 ISD::AssertZext, DL, VT, Val,
3575 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3576 }
3577
3578 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3579 InVals.push_back(Val);
3580 }
3581
3582 // Start adding system SGPRs.
3583 if (IsEntryFunc)
3584 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3585
3586 if (DAG.getPass()) {
3587 auto &ArgUsageInfo =
3589 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(Fn, Info->getArgInfo());
3590 } else if (auto *MFAM = DAG.getMFAM()) {
3591 Module &M = *MF.getFunction().getParent();
3592 auto *ArgUsageInfo =
3593 MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
3594 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3595 if (ArgUsageInfo)
3596 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3597 }
3598
3599 unsigned StackArgSize = CCInfo.getStackSize();
3600 Info->setBytesInStackArgArea(StackArgSize);
3601
3602 return Chains.empty() ? Chain
3603 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3604}
3605
3606// TODO: If return values can't fit in registers, we should return as many as
3607// possible in registers before passing on stack.
3608 bool SITargetLowering::CanLowerReturn(
3609 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3610 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3611 const Type *RetTy) const {
3612 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3613 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3614 // for shaders. Vector types should be explicitly handled by CC.
3615 if (AMDGPU::isEntryFunctionCC(CallConv))
3616 return true;
3617
3618 SmallVector<CCValAssign, 16> RVLocs;
3619 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3620 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3621 return false;
3622
3623 // We must use the stack if return would require unavailable registers.
3624 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3625 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3626 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3627 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3628 return false;
3629
3630 return true;
3631}
3632
3633 SDValue
3634 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3635 bool isVarArg,
3636 const SmallVectorImpl<ISD::OutputArg> &Outs,
3637 const SmallVectorImpl<SDValue> &OutVals,
3638 const SDLoc &DL, SelectionDAG &DAG) const {
3639 MachineFunction &MF = DAG.getMachineFunction();
3640 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3641 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3642
3643 if (AMDGPU::isKernel(CallConv)) {
3644 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3645 OutVals, DL, DAG);
3646 }
3647
3648 bool IsShader = AMDGPU::isShader(CallConv);
3649
3650 Info->setIfReturnsVoid(Outs.empty());
3651 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3652
3653 // CCValAssign - represent the assignment of the return value to a location.
3654 SmallVector<CCValAssign, 48> RVLocs;
3655
3656 // CCState - Info about the registers and stack slots.
3657 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3658 *DAG.getContext());
3659
3660 // Analyze outgoing return values.
3661 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3662
3663 SDValue Glue;
3664 SmallVector<SDValue, 48> RetOps;
3665 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3666
3667 SDValue ReadFirstLane =
3668 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3669 // Copy the result values into the output registers.
3670 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3671 ++I, ++RealRVLocIdx) {
3672 CCValAssign &VA = RVLocs[I];
3673 assert(VA.isRegLoc() && "Can only return in registers!");
3674 // TODO: Partially return in registers if return values don't fit.
3675 SDValue Arg = OutVals[RealRVLocIdx];
3676
3677 // Copied from other backends.
3678 switch (VA.getLocInfo()) {
3679 case CCValAssign::Full:
3680 break;
3681 case CCValAssign::BCvt:
3682 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3683 break;
3684 case CCValAssign::SExt:
3685 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3686 break;
3687 case CCValAssign::ZExt:
3688 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3689 break;
3690 case CCValAssign::AExt:
3691 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3692 break;
3693 default:
3694 llvm_unreachable("Unknown loc info!");
3695 }
3696 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3697 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VA.getLocVT(),
3698 ReadFirstLane, Arg);
3699 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3700 Glue = Chain.getValue(1);
3701 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3702 }
3703
3704 // FIXME: Does sret work properly?
3705 if (!Info->isEntryFunction()) {
3706 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3707 const MCPhysReg *I =
3708 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3709 if (I) {
3710 for (; *I; ++I) {
3711 if (AMDGPU::SReg_64RegClass.contains(*I))
3712 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3713 else if (AMDGPU::SReg_32RegClass.contains(*I))
3714 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3715 else
3716 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3717 }
3718 }
3719 }
3720
3721 // Update chain and glue.
3722 RetOps[0] = Chain;
3723 if (Glue.getNode())
3724 RetOps.push_back(Glue);
3725
3726 unsigned Opc = AMDGPUISD::ENDPGM;
3727 if (!IsWaveEnd)
3728 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3729 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3730 : AMDGPUISD::RET_GLUE;
3731 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3732}
3733
3734 SDValue SITargetLowering::LowerCallResult(
3735 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3736 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3737 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3738 SDValue ThisVal) const {
3739 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3740
3741 // Assign locations to each value returned by this call.
3742 SmallVector<CCValAssign, 16> RVLocs;
3743 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3744 *DAG.getContext());
3745 CCInfo.AnalyzeCallResult(Ins, RetCC);
3746
3747 // Copy all of the result registers out of their specified physreg.
3748 for (CCValAssign VA : RVLocs) {
3749 SDValue Val;
3750
3751 if (VA.isRegLoc()) {
3752 Val =
3753 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3754 Chain = Val.getValue(1);
3755 InGlue = Val.getValue(2);
3756 } else if (VA.isMemLoc()) {
3757 report_fatal_error("TODO: return values in memory");
3758 } else
3759 llvm_unreachable("unknown argument location type");
3760
3761 switch (VA.getLocInfo()) {
3762 case CCValAssign::Full:
3763 break;
3764 case CCValAssign::BCvt:
3765 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3766 break;
3767 case CCValAssign::ZExt:
3768 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3769 DAG.getValueType(VA.getValVT()));
3770 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3771 break;
3772 case CCValAssign::SExt:
3773 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3774 DAG.getValueType(VA.getValVT()));
3775 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3776 break;
3777 case CCValAssign::AExt:
3778 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3779 break;
3780 default:
3781 llvm_unreachable("Unknown loc info!");
3782 }
3783
3784 InVals.push_back(Val);
3785 }
3786
3787 return Chain;
3788}
3789
3790 // Add code to pass special inputs required depending on used features,
3791 // separate from the explicit user arguments present in the IR.
3792 void SITargetLowering::passSpecialInputs(
3793 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3794 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3795 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3796 // If we don't have a call site, this was a call inserted by
3797 // legalization. These can never use special inputs.
3798 if (!CLI.CB)
3799 return;
3800
3801 SelectionDAG &DAG = CLI.DAG;
3802 const SDLoc &DL = CLI.DL;
3803 const Function &F = DAG.getMachineFunction().getFunction();
3804
3805 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3806 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3807
3808 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3809 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3810 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3811 if (DAG.getPass()) {
3812 auto &ArgUsageInfo =
3814 CalleeArgInfo =
3815 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3816 } else if (auto *MFAM = DAG.getMFAM()) {
3818 auto *ArgUsageInfo =
3820 DAG.getMachineFunction())
3821 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3822 if (ArgUsageInfo)
3823 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3824 }
3825 }
3826
3827 // TODO: Unify with private memory register handling. This is complicated by
3828 // the fact that at least in kernels, the input argument is not necessarily
3829 // in the same location as the input.
3830 // clang-format off
3831 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3832 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3833 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3834 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3835 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3836 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3837 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3838 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3839 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3840 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3841 };
3842 // clang-format on
3843
3844 for (auto [InputID, Attrs] : ImplicitAttrs) {
3845 // If the callee does not use the attribute value, skip copying the value.
3846 if (all_of(Attrs, [&](StringRef Attr) {
3847 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3848 }))
3849 continue;
3850
3851 const auto [OutgoingArg, ArgRC, ArgTy] =
3852 CalleeArgInfo->getPreloadedValue(InputID);
3853 if (!OutgoingArg)
3854 continue;
3855
3856 const auto [IncomingArg, IncomingArgRC, Ty] =
3857 CallerArgInfo.getPreloadedValue(InputID);
3858 assert(IncomingArgRC == ArgRC);
3859
3860 // All special arguments are ints for now.
3861 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3862 SDValue InputReg;
3863
3864 if (IncomingArg) {
3865 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3866 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3867 // The implicit arg ptr is special because it doesn't have a corresponding
3868 // input for kernels, and is computed from the kernarg segment pointer.
3869 InputReg = getImplicitArgPtr(DAG, DL);
3870 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3871 std::optional<uint32_t> Id =
3872 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3873 if (Id.has_value()) {
3874 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3875 } else {
3876 InputReg = DAG.getPOISON(ArgVT);
3877 }
3878 } else {
3879 // We may have proven the input wasn't needed, although the ABI is
3880 // requiring it. We just need to allocate the register appropriately.
3881 InputReg = DAG.getPOISON(ArgVT);
3882 }
3883
3884 if (OutgoingArg->isRegister()) {
3885 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3886 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3887 report_fatal_error("failed to allocate implicit input argument");
3888 } else {
3889 unsigned SpecialArgOffset =
3890 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3891 SDValue ArgStore =
3892 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3893 MemOpChains.push_back(ArgStore);
3894 }
3895 }
3896
3897 // Pack workitem IDs into a single register or pass it as is if already
3898 // packed.
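// In the packed form the X id sits in bits [9:0], Y in bits [19:10] and Z in
// bits [29:20], which is what the shifts by 10 and 20 below assemble.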
3899
3900 auto [OutgoingArg, ArgRC, Ty] =
3901 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3902 if (!OutgoingArg)
3903 std::tie(OutgoingArg, ArgRC, Ty) =
3904 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3905 if (!OutgoingArg)
3906 std::tie(OutgoingArg, ArgRC, Ty) =
3907 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3908 if (!OutgoingArg)
3909 return;
3910
3911 const ArgDescriptor *IncomingArgX = std::get<0>(
3912 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3913 const ArgDescriptor *IncomingArgY = std::get<0>(
3914 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3915 const ArgDescriptor *IncomingArgZ = std::get<0>(
3916 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3917
3918 SDValue InputReg;
3919 SDLoc SL;
3920
3921 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3922 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3923 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3924
3925 // If incoming ids are not packed we need to pack them.
3926 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3927 NeedWorkItemIDX) {
3928 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3929 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3930 } else {
3931 InputReg = DAG.getConstant(0, DL, MVT::i32);
3932 }
3933 }
3934
3935 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3936 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3937 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3938 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3939 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3940 InputReg = InputReg.getNode()
3941 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3942 : Y;
3943 }
3944
3945 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3946 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3947 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3948 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3949 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3950 InputReg = InputReg.getNode()
3951 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3952 : Z;
3953 }
3954
3955 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3956 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3957 // We're in a situation where the outgoing function requires the workitem
3958 // ID, but the calling function does not have it (e.g. a graphics function
3959 // calling a C calling convention function). This is illegal, but we need
3960 // to produce something.
3961 InputReg = DAG.getPOISON(MVT::i32);
3962 } else {
3963 // Workitem ids are already packed, so any of the present incoming
3964 // arguments will carry all required fields.
3965 ArgDescriptor IncomingArg =
3966 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3967 : IncomingArgY ? *IncomingArgY
3968 : *IncomingArgZ,
3969 ~0u);
3970 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3971 }
3972 }
3973
3974 if (OutgoingArg->isRegister()) {
3975 if (InputReg)
3976 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3977
3978 CCInfo.AllocateReg(OutgoingArg->getRegister());
3979 } else {
3980 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3981 if (InputReg) {
3982 SDValue ArgStore =
3983 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3984 MemOpChains.push_back(ArgStore);
3985 }
3986 }
3987}
3988
3989 bool SITargetLowering::isEligibleForTailCallOptimization(
3990 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3991 const SmallVectorImpl<ISD::OutputArg> &Outs,
3992 const SmallVectorImpl<SDValue> &OutVals,
3993 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3994 if (AMDGPU::isChainCC(CalleeCC))
3995 return true;
3996
3997 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3998 return false;
3999
4000 // For a divergent call target, we need to do a waterfall loop over the
4001 // possible callees which precludes us from using a simple jump.
4002 if (Callee->isDivergent())
4003 return false;
4004
4005 MachineFunction &MF = DAG.getMachineFunction();
4006 const Function &CallerF = MF.getFunction();
4007 CallingConv::ID CallerCC = CallerF.getCallingConv();
4008 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
4009 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4010
4011 // Kernels aren't callable, and don't have a live in return address so it
4012 // doesn't make sense to do a tail call with entry functions.
4013 if (!CallerPreserved)
4014 return false;
4015
4016 bool CCMatch = CallerCC == CalleeCC;
4017
4018 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
4019 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4020 return true;
4021 return false;
4022 }
4023
4024 // TODO: Can we handle var args?
4025 if (IsVarArg)
4026 return false;
4027
4028 for (const Argument &Arg : CallerF.args()) {
4029 if (Arg.hasByValAttr())
4030 return false;
4031 }
4032
4033 LLVMContext &Ctx = *DAG.getContext();
4034
4035 // Check that the call results are passed in the same way.
4036 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4037 CCAssignFnForCall(CalleeCC, IsVarArg),
4038 CCAssignFnForCall(CallerCC, IsVarArg)))
4039 return false;
4040
4041 // The callee has to preserve all registers the caller needs to preserve.
4042 if (!CCMatch) {
4043 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4044 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4045 return false;
4046 }
4047
4048 // Nothing more to check if the callee is taking no arguments.
4049 if (Outs.empty())
4050 return true;
4051
4052 SmallVector<CCValAssign, 16> ArgLocs;
4053 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4054
4055 // FIXME: We are not allocating special input registers, so we will be
4056 // deciding based on incorrect register assignments.
4057 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4058
4059 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4060 // If the stack arguments for this call do not fit into our own save area then
4061 // the call cannot be made tail.
4062 // TODO: Is this really necessary?
4063 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4064 return false;
4065
4066 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4067 // FIXME: What about inreg arguments that end up passed in memory?
4068 if (!CCVA.isRegLoc())
4069 continue;
4070
4071 // If we are passing an argument in an SGPR, and the value is divergent,
4072 // this call requires a waterfall loop.
4073 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4074 LLVM_DEBUG(
4075 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4076 << printReg(CCVA.getLocReg(), TRI) << '\n');
4077 return false;
4078 }
4079 }
4080
4081 const MachineRegisterInfo &MRI = MF.getRegInfo();
4082 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4083}
4084
4085 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4086 if (!CI->isTailCall())
4087 return false;
4088
4089 const Function *ParentFn = CI->getFunction();
4090 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4091 return false;
4092 return true;
4093}
4094
4095namespace {
4096// Chain calls have special arguments that we need to handle. These are
4097// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4098// arguments (index 0 and 1 respectively).
4099enum ChainCallArgIdx {
4100 Exec = 2,
4101 Flags,
4102 NumVGPRs,
4103 FallbackExec,
4104 FallbackCallee
4105};
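// For example, with Flags == 0 the call site carries no extra operands, while
// a call site with bit 0 of Flags set (dynamic VGPR mode, handled below) also
// supplies NumVGPRs, FallbackExec and FallbackCallee.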
4106} // anonymous namespace
4107
4108// The wave scratch offset register is used as the global base pointer.
4109 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4110 SmallVectorImpl<SDValue> &InVals) const {
4111 CallingConv::ID CallConv = CLI.CallConv;
4112 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4113
4114 SelectionDAG &DAG = CLI.DAG;
4115
4116 const SDLoc &DL = CLI.DL;
4117 SDValue Chain = CLI.Chain;
4118 SDValue Callee = CLI.Callee;
4119
4120 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4121 bool UsesDynamicVGPRs = false;
4122 if (IsChainCallConv) {
4123 // The last arguments should be the value that we need to put in EXEC,
4124 // followed by the flags and any other arguments with special meanings.
4125 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4126 // we don't treat them like the "real" arguments.
4127 auto RequestedExecIt =
4128 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4129 return Arg.OrigArgIndex == 2;
4130 });
4131 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4132
4133 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4134 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4135 CLI.OutVals.end());
4136 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4137
4138 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4139 "Haven't popped all the special args");
4140
4141 TargetLowering::ArgListEntry RequestedExecArg =
4142 CLI.Args[ChainCallArgIdx::Exec];
4143 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4144 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4145
4146 // Convert constants into TargetConstants, so they become immediate operands
4147 // instead of being selected into S_MOV.
4148 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4149 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4150 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4151 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4152 } else
4153 ChainCallSpecialArgs.push_back(Arg.Node);
4154 };
4155
4156 PushNodeOrTargetConstant(RequestedExecArg);
4157
4158 // Process any other special arguments depending on the value of the flags.
4159 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4160
4161 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4162 if (FlagsValue.isZero()) {
4163 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4164 return lowerUnhandledCall(CLI, InVals,
4165 "no additional args allowed if flags == 0");
4166 } else if (FlagsValue.isOneBitSet(0)) {
4167 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4168 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4169 }
4170
4171 if (!Subtarget->isWave32()) {
4172 return lowerUnhandledCall(
4173 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4174 }
4175
4176 UsesDynamicVGPRs = true;
4177 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4178 CLI.Args.end(), PushNodeOrTargetConstant);
4179 }
4180 }
4181
4182 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4183 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4184 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4185 bool &IsTailCall = CLI.IsTailCall;
4186 bool IsVarArg = CLI.IsVarArg;
4187 bool IsSibCall = false;
4188 MachineFunction &MF = DAG.getMachineFunction();
4189
4190 if (Callee.isUndef() || isNullConstant(Callee)) {
4191 if (!CLI.IsTailCall) {
4192 for (ISD::InputArg &Arg : CLI.Ins)
4193 InVals.push_back(DAG.getPOISON(Arg.VT));
4194 }
4195
4196 return Chain;
4197 }
4198
4199 if (IsVarArg) {
4200 return lowerUnhandledCall(CLI, InVals,
4201 "unsupported call to variadic function ");
4202 }
4203
4204 if (!CLI.CB)
4205 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4206
4207 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4208 return lowerUnhandledCall(CLI, InVals,
4209 "unsupported required tail call to function ");
4210 }
4211
4212 if (IsTailCall) {
4213 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4214 Outs, OutVals, Ins, DAG);
4215 if (!IsTailCall &&
4216 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4217 report_fatal_error("failed to perform tail call elimination on a call "
4218 "site marked musttail or on llvm.amdgcn.cs.chain");
4219 }
4220
4221 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4222
4223 // A sibling call is one where we're under the usual C ABI and not planning
4224 // to change that but can still do a tail call:
4225 if (!TailCallOpt && IsTailCall)
4226 IsSibCall = true;
4227
4228 if (IsTailCall)
4229 ++NumTailCalls;
4230 }
4231
4234 SmallVector<SDValue, 8> MemOpChains;
4235
4236 // Analyze operands of the call, assigning locations to each operand.
4237 SmallVector<CCValAssign, 16> ArgLocs;
4238 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4239 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4240
4241 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4242 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4243 // With a fixed ABI, allocate fixed registers before user arguments.
4244 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4245 }
4246
4247 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4248
4249 // Get a count of how many bytes are to be pushed on the stack.
4250 unsigned NumBytes = CCInfo.getStackSize();
4251
4252 if (IsSibCall) {
4253 // Since we're not changing the ABI to make this a tail call, the memory
4254 // operands are already available in the caller's incoming argument space.
4255 NumBytes = 0;
4256 }
4257
4258 // FPDiff is the byte offset of the call's argument area from the callee's.
4259 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4260 // by this amount for a tail call. In a sibling call it must be 0 because the
4261 // caller will deallocate the entire stack and the callee still expects its
4262 // arguments to begin at SP+0. Completely unused for non-tail calls.
4263 int32_t FPDiff = 0;
4264 MachineFrameInfo &MFI = MF.getFrameInfo();
4265 auto *TRI = Subtarget->getRegisterInfo();
4266
4267 // Adjust the stack pointer for the new arguments...
4268 // These operations are automatically eliminated by the prolog/epilog pass
4269 if (!IsSibCall)
4270 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4271
4272 if (!IsSibCall || IsChainCallConv) {
4273 if (!Subtarget->hasFlatScratchEnabled()) {
4274 SmallVector<SDValue, 4> CopyFromChains;
4275
4276 // In the HSA case, this should be an identity copy.
4277 SDValue ScratchRSrcReg =
4278 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4279 RegsToPass.emplace_back(IsChainCallConv
4280 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4281 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4282 ScratchRSrcReg);
4283 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4284 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4285 }
4286 }
4287
4288 const unsigned NumSpecialInputs = RegsToPass.size();
4289
4290 MVT PtrVT = MVT::i32;
4291
4292 // Walk the register/memloc assignments, inserting copies/loads.
4293 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4294 CCValAssign &VA = ArgLocs[i];
4295 SDValue Arg = OutVals[i];
4296
4297 // Promote the value if needed.
4298 switch (VA.getLocInfo()) {
4299 case CCValAssign::Full:
4300 break;
4301 case CCValAssign::BCvt:
4302 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4303 break;
4304 case CCValAssign::ZExt:
4305 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4306 break;
4307 case CCValAssign::SExt:
4308 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4309 break;
4310 case CCValAssign::AExt:
4311 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4312 break;
4313 case CCValAssign::FPExt:
4314 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4315 break;
4316 default:
4317 llvm_unreachable("Unknown loc info!");
4318 }
4319
4320 if (VA.isRegLoc()) {
4321 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4322 } else {
4323 assert(VA.isMemLoc());
4324
4325 SDValue DstAddr;
4326 MachinePointerInfo DstInfo;
4327
4328 unsigned LocMemOffset = VA.getLocMemOffset();
4329 int32_t Offset = LocMemOffset;
4330
4331 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4332 MaybeAlign Alignment;
4333
4334 if (IsTailCall) {
4335 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4336 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4337 : VA.getValVT().getStoreSize();
4338
4339 // FIXME: We can have better than the minimum byval required alignment.
4340 Alignment =
4341 Flags.isByVal()
4342 ? Flags.getNonZeroByValAlign()
4343 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4344
4345 Offset = Offset + FPDiff;
4346 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4347
4348 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4349 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4350
4351 // Make sure any stack arguments overlapping with where we're storing
4352 // are loaded before this eventual operation. Otherwise they'll be
4353 // clobbered.
4354
4355 // FIXME: Why is this really necessary? This seems to just result in a
4356 // lot of code to copy the stack and write them back to the same
4357 // locations, which are supposed to be immutable?
4358 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4359 } else {
4360 // Stores to the argument stack area are relative to the stack pointer.
4361 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4362 MVT::i32);
4363 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4364 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4365 Alignment =
4366 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4367 }
4368
4369 if (Outs[i].Flags.isByVal()) {
4370 SDValue SizeNode =
4371 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4372 SDValue Cpy =
4373 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4374 Outs[i].Flags.getNonZeroByValAlign(),
4375 /*isVol = */ false, /*AlwaysInline = */ true,
4376 /*CI=*/nullptr, std::nullopt, DstInfo,
4378
4379 MemOpChains.push_back(Cpy);
4380 } else {
4381 SDValue Store =
4382 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4383 MemOpChains.push_back(Store);
4384 }
4385 }
4386 }
4387
4388 if (!MemOpChains.empty())
4389 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4390
4391 SDValue ReadFirstLaneID =
4392 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4393
4394 SDValue TokenGlue;
4395 if (CLI.ConvergenceControlToken) {
4396 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4398 }
4399
4400 // Build a sequence of copy-to-reg nodes chained together with token chain
4401 // and flag operands which copy the outgoing args into the appropriate regs.
4402 SDValue InGlue;
4403
4404 unsigned ArgIdx = 0;
4405 for (auto [Reg, Val] : RegsToPass) {
4406 if (ArgIdx++ >= NumSpecialInputs &&
4407 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4408 // For chain calls, the inreg arguments are required to be
4409 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4410 // they are uniform.
4411 //
4412 // For other calls, if an inreg argument is known to be uniform,
4413 // speculatively insert a readfirstlane in case it is in a VGPR.
4414 //
4415 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4416 // value, so let that continue to produce invalid code.
4417
4418 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4419 if (TokenGlue)
4420 ReadfirstlaneArgs.push_back(TokenGlue);
4421 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4422 ReadfirstlaneArgs);
4423 }
4424
4425 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4426 InGlue = Chain.getValue(1);
4427 }
4428
4429 // We don't usually want to end the call-sequence here because we would tidy
4430 // the frame up *after* the call, however in the ABI-changing tail-call case
4431 // we've carefully laid out the parameters so that when sp is reset they'll be
4432 // in the correct location.
4433 if (IsTailCall && !IsSibCall) {
4434 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4435 InGlue = Chain.getValue(1);
4436 }
4437
4438 std::vector<SDValue> Ops({Chain});
4439
4440 // Add a redundant copy of the callee global which will not be legalized, as
4441 // we need direct access to the callee later.
4442 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4443 const GlobalValue *GV = GSD->getGlobal();
4444 Ops.push_back(Callee);
4445 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4446 } else {
4447 if (IsTailCall) {
4448 // isEligibleForTailCallOptimization considered whether the call target is
4449 // divergent, but we may still end up with a uniform value in a VGPR.
4450 // Insert a readfirstlane just in case.
4451 SDValue ReadFirstLaneID =
4452 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4453
4454 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4455 if (TokenGlue)
4456 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4457 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4458 ReadfirstlaneArgs);
4459 }
4460
4461 Ops.push_back(Callee);
4462 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4463 }
4464
4465 if (IsTailCall) {
4466 // Each tail call may have to adjust the stack by a different amount, so
4467 // this information must travel along with the operation for eventual
4468 // consumption by emitEpilogue.
4469 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4470 }
4471
4472 if (IsChainCallConv)
4473 llvm::append_range(Ops, ChainCallSpecialArgs);
4474
4475 // Add argument registers to the end of the list so that they are known live
4476 // into the call.
4477 for (auto &[Reg, Val] : RegsToPass)
4478 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4479
4480 // Add a register mask operand representing the call-preserved registers.
4481 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4482 assert(Mask && "Missing call preserved mask for calling convention");
4483 Ops.push_back(DAG.getRegisterMask(Mask));
4484
4485 if (SDValue Token = CLI.ConvergenceControlToken) {
4486 SmallVector<SDValue, 2> GlueOps;
4487 GlueOps.push_back(Token);
4488 if (InGlue)
4489 GlueOps.push_back(InGlue);
4490
4491 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4492 MVT::Glue, GlueOps),
4493 0);
4494 }
4495
4496 if (InGlue)
4497 Ops.push_back(InGlue);
4498
4499 // If we're doing a tail call, use a TC_RETURN here rather than an
4500 // actual call instruction.
4501 if (IsTailCall) {
4502 MFI.setHasTailCall();
4503 unsigned OPC = AMDGPUISD::TC_RETURN;
4504 switch (CallConv) {
4505 case CallingConv::AMDGPU_Gfx:
4506 OPC = AMDGPUISD::TC_RETURN_GFX;
4507 break;
4508 case CallingConv::AMDGPU_CS_Chain:
4509 case CallingConv::AMDGPU_CS_ChainPreserve:
4510 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4511 : AMDGPUISD::TC_RETURN_CHAIN;
4512 break;
4513 }
4514
4515 // If the caller is a whole wave function, we need to use a special opcode
4516 // so we can patch up EXEC.
4517 if (Info->isWholeWaveFunction())
4518 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4519
4520 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4521 }
4522
4523 // Returns a chain and a flag for retval copy to use.
4524 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4525 Chain = Call.getValue(0);
4526 InGlue = Call.getValue(1);
4527
4528 uint64_t CalleePopBytes = NumBytes;
4529 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4530 if (!Ins.empty())
4531 InGlue = Chain.getValue(1);
4532
4533 // Handle result values, copying them out of physregs into vregs that we
4534 // return.
4535 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4536 InVals, /*IsThisReturn=*/false, SDValue());
4537}
4538
4539// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4540// except for:
4541 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4542 // 2. Scaled size, where scale = wave-reduction(alloca-size) * wave-size
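// For example, on a wave64 target a per-lane alloca of 16 bytes advances the
// stack pointer by 16 << 6 = 1024 bytes, giving every lane of the wave its
// own 16-byte slice of the swizzled scratch region.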
4543 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4544 SelectionDAG &DAG) const {
4545 const MachineFunction &MF = DAG.getMachineFunction();
4546 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4547
4548 SDLoc dl(Op);
4549 EVT VT = Op.getValueType();
4550 SDValue Chain = Op.getOperand(0);
4551 Register SPReg = Info->getStackPtrOffsetReg();
4552
4553 // Chain the dynamic stack allocation so that it doesn't modify the stack
4554 // pointer when other instructions are using the stack.
4555 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4556
4557 SDValue Size = Op.getOperand(1);
4558 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4559 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4560
4561 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4562 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4563 "Stack grows upwards for AMDGPU");
4564
4565 Chain = BaseAddr.getValue(1);
4566 Align StackAlign = TFL->getStackAlign();
4567 if (Alignment > StackAlign) {
4568 uint64_t ScaledAlignment = Alignment.value()
4569 << Subtarget->getWavefrontSizeLog2();
4570 uint64_t StackAlignMask = ScaledAlignment - 1;
4571 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4572 DAG.getConstant(StackAlignMask, dl, VT));
4573 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4574 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4575 }
4576
4577 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4578 SDValue NewSP;
4579 if (isa<ConstantSDNode>(Size)) {
4580 // For constant sized alloca, scale alloca size by wave-size
4581 SDValue ScaledSize = DAG.getNode(
4582 ISD::SHL, dl, VT, Size,
4583 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4584 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4585 } else {
4586 // For dynamic sized alloca, perform wave-wide reduction to get max of
4587 // alloca size (divergent) and then scale it by wave-size
4588 SDValue WaveReduction =
4589 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4590 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4591 Size, DAG.getConstant(0, dl, MVT::i32));
4592 SDValue ScaledSize = DAG.getNode(
4593 ISD::SHL, dl, VT, Size,
4594 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4595 NewSP =
4596 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4597 SDValue ReadFirstLaneID =
4598 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4599 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4600 NewSP);
4601 }
4602
4603 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4604 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4605
4606 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4607}
4608
4609 SDValue SITargetLowering::lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4610 if (Op.getValueType() != MVT::i32)
4611 return Op; // Defer to cannot select error.
4612
4613 Register SP = getStackPointerRegisterToSaveRestore();
4614 SDLoc SL(Op);
4615
4616 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4617
4618 // Convert from wave uniform to swizzled vector address. This should protect
4619 // from any edge cases where the stacksave result isn't directly used with
4620 // stackrestore.
4621 SDValue VectorAddress =
4622 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4623 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4624}
4625
4626 SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4627 SelectionDAG &DAG) const {
4628 SDLoc SL(Op);
4629 assert(Op.getValueType() == MVT::i32);
4630
4631 uint32_t BothRoundHwReg =
4632 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4633 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4634
4635 SDValue IntrinID =
4636 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4637 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4638 Op.getOperand(0), IntrinID, GetRoundBothImm);
4639
4640 // There are two rounding modes, one for f32 and one for f64/f16. We only
4641 // report in the standard value range if both are the same.
4642 //
4643 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4644 // ties away from zero is not supported, and the other values are rotated by
4645 // 1.
4646 //
4647 // If the two rounding modes are not the same, report a target defined value.
4648
4649 // Mode register rounding mode fields:
4650 //
4651 // [1:0] Single-precision round mode.
4652 // [3:2] Double/Half-precision round mode.
4653 //
4654 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4655 //
4656 // Hardware Spec
4657 // Toward-0 3 0
4658 // Nearest Even 0 1
4659 // +Inf 1 2
4660 // -Inf 2 3
4661 // NearestAway0 N/A 4
4662 //
4663 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4664 // table we can index by the raw hardware mode.
4665 //
4666 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
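// In other words, the raw 4-bit mode value N selects nibble N of the table:
// the shift amount below is N * 4 (computed as GetReg << 2), and the low four
// bits of the shifted table are the FLT_ROUNDS result (or an extended value
// that is offset by 4 further down).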
4667
4668 SDValue BitTable =
4669 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4670
4671 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4672 SDValue RoundModeTimesNumBits =
4673 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4674
4675 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4676 // knew only one mode was demanded.
4677 SDValue TableValue =
4678 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4679 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4680
4681 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4682 SDValue TableEntry =
4683 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4684
4685 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4686 // if it's an extended value.
4687 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4688 SDValue IsStandardValue =
4689 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4690 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4691 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4692 TableEntry, EnumOffset);
4693
4694 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4695}
4696
4697 SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4698 SelectionDAG &DAG) const {
4699 SDLoc SL(Op);
4700
4701 SDValue NewMode = Op.getOperand(1);
4702 assert(NewMode.getValueType() == MVT::i32);
4703
4704 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4705 // hardware MODE.fp_round values.
4706 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4707 uint32_t ClampedVal = std::min(
4708 static_cast<uint32_t>(ConstMode->getZExtValue()),
4710 NewMode = DAG.getConstant(
4711 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4712 } else {
4713 // If we know the input can only be one of the supported standard modes in
4714 // the range 0-3, we can use a simplified mapping to hardware values.
4715 KnownBits KB = DAG.computeKnownBits(NewMode);
4716 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4717 // The supported standard values are 0-3. The extended values start at 8. We
4718 // need to offset by 4 if the value is in the extended range.
4719
4720 if (UseReducedTable) {
4721 // Truncate to the low 32-bits.
4722 SDValue BitTable = DAG.getConstant(
4723 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4724
4725 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4726 SDValue RoundModeTimesNumBits =
4727 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4728
4729 NewMode =
4730 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4731
4732 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4733 // the table extracted bits into inline immediates.
4734 } else {
4735 // table_index = umin(value, value - 4)
4736 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
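// The unsigned min folds both ranges into one index: for a standard value
// such as 2, 2 - 4 wraps around and umin keeps 2, while an extended value
// such as 11 maps to umin(11, 7) = 7, so the extended entries occupy table
// indices 4..7.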
4737 SDValue BitTable =
4738 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4739
4740 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4741 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4742 SDValue IndexVal =
4743 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4744
4745 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4746 SDValue RoundModeTimesNumBits =
4747 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4748
4749 SDValue TableValue =
4750 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4751 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4752
4753 // No need to mask out the high bits since the setreg will ignore them
4754 // anyway.
4755 NewMode = TruncTable;
4756 }
4757
4758 // Insert a readfirstlane in case the value is a VGPR. We could do this
4759 // earlier and keep more operations scalar, but that interferes with
4760 // combining the source.
4761 SDValue ReadFirstLaneID =
4762 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4763 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4764 ReadFirstLaneID, NewMode);
4765 }
4766
4767 // N.B. The setreg will be later folded into s_round_mode on supported
4768 // targets.
4769 SDValue IntrinID =
4770 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4771 uint32_t BothRoundHwReg =
4772 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4773 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4774
4775 SDValue SetReg =
4776 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4777 IntrinID, RoundBothImm, NewMode);
4778
4779 return SetReg;
4780}
4781
4782 SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4783 if (Op->isDivergent() &&
4784 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4785 // Cannot do I$ prefetch with divergent pointer.
4786 return SDValue();
4787
4788 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4789 case AMDGPUAS::FLAT_ADDRESS:
4790 case AMDGPUAS::GLOBAL_ADDRESS:
4791 case AMDGPUAS::CONSTANT_ADDRESS:
4792 break;
4793 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4794 if (Subtarget->hasSafeSmemPrefetch())
4795 break;
4796 [[fallthrough]];
4797 default:
4798 return SDValue();
4799 }
4800
4801 // I$ prefetch
4802 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4803 return SDValue();
4804
4805 return Op;
4806}
4807
4808// Work around DAG legality rules only based on the result type.
4809 SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4810 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4811 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4812 EVT SrcVT = Src.getValueType();
4813
4814 if (SrcVT.getScalarType() != MVT::bf16)
4815 return Op;
4816
4817 SDLoc SL(Op);
4818 SDValue BitCast =
4819 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4820
4821 EVT DstVT = Op.getValueType();
4822 if (IsStrict)
4823 llvm_unreachable("Need STRICT_BF16_TO_FP");
4824
4825 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4826}
4827
4828 SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4829 SDLoc SL(Op);
4830 if (Op.getValueType() != MVT::i64)
4831 return Op;
4832
4833 uint32_t ModeHwReg =
4835 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4836 uint32_t TrapHwReg =
4838 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4839
4840 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4841 SDValue IntrinID =
4842 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4843 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4844 Op.getOperand(0), IntrinID, ModeHwRegImm);
4845 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4846 Op.getOperand(0), IntrinID, TrapHwRegImm);
4847 SDValue TokenReg =
4848 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4849 GetTrapReg.getValue(1));
4850
4851 SDValue CvtPtr =
4852 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4853 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4854
4855 return DAG.getMergeValues({Result, TokenReg}, SL);
4856}
4857
4858 SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4859 SDLoc SL(Op);
4860 if (Op.getOperand(1).getValueType() != MVT::i64)
4861 return Op;
4862
4863 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4864 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4865 DAG.getConstant(0, SL, MVT::i32));
4866 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4867 DAG.getConstant(1, SL, MVT::i32));
4868
4869 SDValue ReadFirstLaneID =
4870 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4871 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4872 ReadFirstLaneID, NewModeReg);
4873 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4874 ReadFirstLaneID, NewTrapReg);
4875
4876 unsigned ModeHwReg =
4878 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4879 unsigned TrapHwReg =
4881 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4882
4883 SDValue IntrinID =
4884 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4885 SDValue SetModeReg =
4886 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4887 IntrinID, ModeHwRegImm, NewModeReg);
4888 SDValue SetTrapReg =
4889 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4890 IntrinID, TrapHwRegImm, NewTrapReg);
4891 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4892}
4893
4894 Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4895 const MachineFunction &MF) const {
4896 const Function &Fn = MF.getFunction();
4897
4898 Register Reg = StringSwitch<Register>(RegName)
4899 .Case("m0", AMDGPU::M0)
4900 .Case("exec", AMDGPU::EXEC)
4901 .Case("exec_lo", AMDGPU::EXEC_LO)
4902 .Case("exec_hi", AMDGPU::EXEC_HI)
4903 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4904 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4905 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4906 .Default(Register());
4907 if (!Reg)
4908 return Reg;
4909
4910 if (!Subtarget->hasFlatScrRegister() &&
4911 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4912 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4913 "\" for subtarget."));
4914 }
4915
4916 switch (Reg) {
4917 case AMDGPU::M0:
4918 case AMDGPU::EXEC_LO:
4919 case AMDGPU::EXEC_HI:
4920 case AMDGPU::FLAT_SCR_LO:
4921 case AMDGPU::FLAT_SCR_HI:
4922 if (VT.getSizeInBits() == 32)
4923 return Reg;
4924 break;
4925 case AMDGPU::EXEC:
4926 case AMDGPU::FLAT_SCR:
4927 if (VT.getSizeInBits() == 64)
4928 return Reg;
4929 break;
4930 default:
4931 llvm_unreachable("missing register type checking");
4932 }
4933
4934 report_fatal_error(
4935 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4936}
4937
4938// If kill is not the last instruction, split the block so kill is always a
4939// proper terminator.
4940 MachineBasicBlock *
4941 SITargetLowering::splitKillBlock(MachineInstr &MI,
4942 MachineBasicBlock *BB) const {
4943 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4944 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4945 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4946 return SplitBB;
4947}
4948
4949// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4950// \p MI will be the only instruction in the loop body block. Otherwise, it will
4951// be the first instruction in the remainder block.
4952//
4953/// \returns { LoopBody, Remainder }
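// The resulting control flow is MBB -> LoopBB -> RemainderBB, with LoopBB
// also listed as its own successor so it can branch back to itself.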
4954static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4955 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4956 MachineFunction *MF = MBB.getParent();
4957 MachineBasicBlock::iterator I(&MI);
4958
4959 // To insert the loop we need to split the block. Move everything after this
4960 // point to a new block, and insert a new empty block between the two.
4961 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4962 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4963 MachineFunction::iterator MBBI(MBB);
4964 ++MBBI;
4965
4966 MF->insert(MBBI, LoopBB);
4967 MF->insert(MBBI, RemainderBB);
4968
4969 LoopBB->addSuccessor(LoopBB);
4970 LoopBB->addSuccessor(RemainderBB);
4971
4972 // Move the rest of the block into a new block.
4973 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4974
4975 if (InstInLoop) {
4976 auto Next = std::next(I);
4977
4978 // Move instruction to loop body.
4979 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4980
4981 // Move the rest of the block.
4982 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4983 } else {
4984 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4985 }
4986
4987 MBB.addSuccessor(LoopBB);
4988
4989 return std::pair(LoopBB, RemainderBB);
4990}
4991
4992/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4993 static void bundleInstWithWaitcnt(MachineInstr &MI) {
4994 MachineBasicBlock *MBB = MI.getParent();
4995 const SIInstrInfo *TII = MBB->getParent()->getSubtarget<GCNSubtarget>().getInstrInfo();
4996 auto I = MI.getIterator();
4997 auto E = std::next(I);
4998
4999 // clang-format off
5000 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5001 .addImm(0);
5002 // clang-format on
5003
5004 MIBundleBuilder Bundler(*MBB, I, E);
5005 finalizeBundle(*MBB, Bundler.begin());
5006}
5007
5008 MachineBasicBlock *
5009 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
5010 MachineBasicBlock *BB) const {
5011 const DebugLoc &DL = MI.getDebugLoc();
5012
5013 MachineFunction *MF = BB->getParent();
5014
5015 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5016
5017 // Apparently kill flags are only valid if the def is in the same block?
5018 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5019 Src->setIsKill(false);
5020
5021 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5022
5023 MachineBasicBlock::iterator I = LoopBB->end();
5024
5025 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5027
5028 // Clear TRAP_STS.MEM_VIOL
5029 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5030 .addImm(0)
5031 .addImm(EncodedReg);
5032
5034
5035 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5036
5037 // Load and check TRAP_STS.MEM_VIOL
5038 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5039 .addImm(EncodedReg);
5040
5041 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5042 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5043 .addReg(Reg, RegState::Kill)
5044 .addImm(0);
5045 // clang-format off
5046 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5047 .addMBB(LoopBB);
5048 // clang-format on
5049
5050 return RemainderBB;
5051}
5052
5053// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5054// wavefront. If the value is uniform and just happens to be in a VGPR, this
5055// will only do one iteration. In the worst case, this will loop 64 times.
5056//
5057// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
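// Conceptually, the loop emitted below is:
//   loop:
//     CurrentIdx = readfirstlane(Idx)      ; pick the index of one active lane
//     Cond       = (Idx == CurrentIdx)     ; all lanes that share that index
//     Saved      = exec; exec &= Cond      ; run just those lanes this trip
//     M0 (or the GPR index register) = CurrentIdx + Offset
//     ...the indexed access is inserted at the returned insertion point...
//     exec = exec ^ Saved                  ; drop the lanes just handled
//   } while (exec != 0)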
5060 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5061 const DebugLoc &DL, const MachineOperand &Idx,
5062 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5063 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5064 Register &SGPRIdxReg) {
5065
5066 MachineFunction *MF = OrigBB.getParent();
5067 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5068 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5071
5072 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5073 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5074 Register NewExec = MRI.createVirtualRegister(BoolRC);
5075 Register CurrentIdxReg =
5076 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5077 Register CondReg = MRI.createVirtualRegister(BoolRC);
5078
5079 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5080 .addReg(InitReg)
5081 .addMBB(&OrigBB)
5082 .addReg(ResultReg)
5083 .addMBB(&LoopBB);
5084
5085 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5086 .addReg(InitSaveExecReg)
5087 .addMBB(&OrigBB)
5088 .addReg(NewExec)
5089 .addMBB(&LoopBB);
5090
5091 // Read the next variant <- also loop target.
5092 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5093 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5094
5095 // Compare the just read M0 value to all possible Idx values.
5096 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5097 .addReg(CurrentIdxReg)
5098 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5099
5100 // Update EXEC, save the original EXEC value to VCC.
5101 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5102 .addReg(CondReg, RegState::Kill);
5103
5104 MRI.setSimpleHint(NewExec, CondReg);
5105
5106 if (UseGPRIdxMode) {
5107 if (Offset == 0) {
5108 SGPRIdxReg = CurrentIdxReg;
5109 } else {
5110 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5111 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5112 .addReg(CurrentIdxReg, RegState::Kill)
5113 .addImm(Offset);
5114 }
5115 } else {
5116 // Move index from VCC into M0
5117 if (Offset == 0) {
5118 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5119 .addReg(CurrentIdxReg, RegState::Kill);
5120 } else {
5121 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5122 .addReg(CurrentIdxReg, RegState::Kill)
5123 .addImm(Offset);
5124 }
5125 }
5126
5127 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5128 MachineInstr *InsertPt =
5129 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5130 .addReg(LMC.ExecReg)
5131 .addReg(NewExec);
5132
5133 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5134 // s_cbranch_scc0?
5135
5136 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5137 // clang-format off
5138 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5139 .addMBB(&LoopBB);
5140 // clang-format on
5141
5142 return InsertPt->getIterator();
5143}
5144
5145 // This has slightly sub-optimal regalloc when the source vector is killed by
5146 // the read. The register allocator does not understand that the kill is
5147 // per-workitem, so the vector is kept alive for the whole loop and we end up
5148 // not re-using a subregister from it, using 1 more VGPR than necessary. That
5149 // VGPR was saved when this was expanded after register allocation.
5150 static MachineBasicBlock::iterator
5151 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5152 unsigned InitResultReg, unsigned PhiReg, int Offset,
5153 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5154 MachineFunction *MF = MBB.getParent();
5155 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5156 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5157 MachineRegisterInfo &MRI = MF->getRegInfo();
5158 const DebugLoc &DL = MI.getDebugLoc();
5160
5161 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5162 Register DstReg = MI.getOperand(0).getReg();
5163 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5164 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5166
5167 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5168
5169 // Save the EXEC mask
5170 // clang-format off
5171 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5172 .addReg(LMC.ExecReg);
5173 // clang-format on
5174
5175 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5176
5177 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5178
5179 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5180 InitResultReg, DstReg, PhiReg, TmpExec,
5181 Offset, UseGPRIdxMode, SGPRIdxReg);
5182
5183 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5185 ++MBBI;
5186 MF->insert(MBBI, LandingPad);
5187 LoopBB->removeSuccessor(RemainderBB);
5188 LandingPad->addSuccessor(RemainderBB);
5189 LoopBB->addSuccessor(LandingPad);
5190 MachineBasicBlock::iterator First = LandingPad->begin();
5191 // clang-format off
5192 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5193 .addReg(SaveExec);
5194 // clang-format on
5195
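  // At this point the control flow built by this helper is, schematically:
  //   MBB        -> save EXEC, fall through to LoopBB
  //   LoopBB     -> waterfall loop emitted by emitLoadM0FromVGPRLoop
  //   LandingPad -> restore EXEC, fall through to RemainderBB
  // The returned insertion point is inside LoopBB, in front of its
  // terminators, so callers can place the per-iteration work there.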
5196 return InsPt;
5197}
5198
5199// Returns subreg index, offset
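// For example, indexing a 128-bit (4 x 32-bit) vector with Offset 2 yields
// (sub2, 0), i.e. a statically known subregister and no dynamic offset, while
// an out-of-range Offset such as 7 is returned unchanged as (sub0, 7).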
5200static std::pair<unsigned, int>
5202 const TargetRegisterClass *SuperRC, unsigned VecReg,
5203 int Offset) {
5204 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5205
5206 // Skip out of bounds offsets, or else we would end up using an undefined
5207 // register.
5208 if (Offset >= NumElts || Offset < 0)
5209 return std::pair(AMDGPU::sub0, Offset);
5210
5211 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5212}
5213
5216 int Offset) {
5217 MachineBasicBlock *MBB = MI.getParent();
5218 const DebugLoc &DL = MI.getDebugLoc();
5220
5221 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5222
5223 assert(Idx->getReg() != AMDGPU::NoRegister);
5224
5225 if (Offset == 0) {
5226 // clang-format off
5227 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5228 .add(*Idx);
5229 // clang-format on
5230 } else {
5231 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5232 .add(*Idx)
5233 .addImm(Offset);
5234 }
5235}
5236
5239 int Offset) {
5240 MachineBasicBlock *MBB = MI.getParent();
5241 const DebugLoc &DL = MI.getDebugLoc();
5243
5244 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5245
5246 if (Offset == 0)
5247 return Idx->getReg();
5248
5249 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5250 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5251 .add(*Idx)
5252 .addImm(Offset);
5253 return Tmp;
5254}
5255
5258 const GCNSubtarget &ST) {
5259 const SIInstrInfo *TII = ST.getInstrInfo();
5260 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5261 MachineFunction *MF = MBB.getParent();
5263
5264 Register Dst = MI.getOperand(0).getReg();
5265 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5266 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5267 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5268
5269 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5270 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5271
5272 unsigned SubReg;
5273 std::tie(SubReg, Offset) =
5274 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5275
5276 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5277
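  // A sketch of the two strategies used below: with a uniform (SGPR) index the
  // extract is a single indexed read (the GPR-IDX pseudo returned by
  // getIndirectGPRIDXPseudo, or V_MOVRELS_B32 with the index in M0), while a
  // divergent (VGPR) index wraps the same read in the loadM0FromVGPR waterfall
  // loop so it runs once per distinct index value in the wave.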
5278 // Check for a SGPR index.
5279 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5281 const DebugLoc &DL = MI.getDebugLoc();
5282
5283 if (UseGPRIdxMode) {
5284 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5285 // to avoid interfering with other uses, so probably requires a new
5286 // optimization pass.
5288
5289 const MCInstrDesc &GPRIDXDesc =
5290 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5291 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5292 .addReg(SrcReg)
5293 .addReg(Idx)
5294 .addImm(SubReg);
5295 } else {
5297
5298 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5299 .addReg(SrcReg, {}, SubReg)
5300 .addReg(SrcReg, RegState::Implicit);
5301 }
5302
5303 MI.eraseFromParent();
5304
5305 return &MBB;
5306 }
5307
5308 // Control flow needs to be inserted if indexing with a VGPR.
5309 const DebugLoc &DL = MI.getDebugLoc();
5311
5312 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5313 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5314
5315 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5316
5317 Register SGPRIdxReg;
5318 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5319 UseGPRIdxMode, SGPRIdxReg);
5320
5321 MachineBasicBlock *LoopBB = InsPt->getParent();
5322
5323 if (UseGPRIdxMode) {
5324 const MCInstrDesc &GPRIDXDesc =
5325 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5326
5327 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5328 .addReg(SrcReg)
5329 .addReg(SGPRIdxReg)
5330 .addImm(SubReg);
5331 } else {
5332 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5333 .addReg(SrcReg, {}, SubReg)
5334 .addReg(SrcReg, RegState::Implicit);
5335 }
5336
5337 MI.eraseFromParent();
5338
5339 return LoopBB;
5340}
5341
5344 const GCNSubtarget &ST) {
5345 const SIInstrInfo *TII = ST.getInstrInfo();
5346 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5347 MachineFunction *MF = MBB.getParent();
5349
5350 Register Dst = MI.getOperand(0).getReg();
5351 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5352 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5353 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5354 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5355 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5356 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5357
5358 // This can be an immediate, but will be folded later.
5359 assert(Val->getReg());
5360
5361 unsigned SubReg;
5362 std::tie(SubReg, Offset) =
5363 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5364 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5365
5366 if (Idx->getReg() == AMDGPU::NoRegister) {
5368 const DebugLoc &DL = MI.getDebugLoc();
5369
5370 assert(Offset == 0);
5371
5372 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5373 .add(*SrcVec)
5374 .add(*Val)
5375 .addImm(SubReg);
5376
5377 MI.eraseFromParent();
5378 return &MBB;
5379 }
5380
5381 // Check for a SGPR index.
5382 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5384 const DebugLoc &DL = MI.getDebugLoc();
5385
5386 if (UseGPRIdxMode) {
5388
5389 const MCInstrDesc &GPRIDXDesc =
5390 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5391 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5392 .addReg(SrcVec->getReg())
5393 .add(*Val)
5394 .addReg(Idx)
5395 .addImm(SubReg);
5396 } else {
5398
5399 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5400 TRI.getRegSizeInBits(*VecRC), 32, false);
5401 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5402 .addReg(SrcVec->getReg())
5403 .add(*Val)
5404 .addImm(SubReg);
5405 }
5406 MI.eraseFromParent();
5407 return &MBB;
5408 }
5409
5410 // Control flow needs to be inserted if indexing with a VGPR.
5411 if (Val->isReg())
5412 MRI.clearKillFlags(Val->getReg());
5413
5414 const DebugLoc &DL = MI.getDebugLoc();
5415
5416 Register PhiReg = MRI.createVirtualRegister(VecRC);
5417
5418 Register SGPRIdxReg;
5419 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5420 UseGPRIdxMode, SGPRIdxReg);
5421 MachineBasicBlock *LoopBB = InsPt->getParent();
5422
5423 if (UseGPRIdxMode) {
5424 const MCInstrDesc &GPRIDXDesc =
5425 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5426
5427 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5428 .addReg(PhiReg)
5429 .add(*Val)
5430 .addReg(SGPRIdxReg)
5431 .addImm(SubReg);
5432 } else {
5433 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5434 TRI.getRegSizeInBits(*VecRC), 32, false);
5435 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5436 .addReg(PhiReg)
5437 .add(*Val)
5438 .addImm(SubReg);
5439 }
5440
5441 MI.eraseFromParent();
5442 return LoopBB;
5443}
5444
5446 MachineBasicBlock *BB) {
5447 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5448 // For GFX12, we emit s_add_u64 and s_sub_u64.
5449 MachineFunction *MF = BB->getParent();
5450 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5451 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5453 const DebugLoc &DL = MI.getDebugLoc();
5454 MachineOperand &Dest = MI.getOperand(0);
5455 MachineOperand &Src0 = MI.getOperand(1);
5456 MachineOperand &Src1 = MI.getOperand(2);
5457 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5458 if (ST.hasScalarAddSub64()) {
5459 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5460 // clang-format off
5461 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5462 .add(Src0)
5463 .add(Src1);
5464 // clang-format on
5465 } else {
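    // Without a native scalar 64-bit add/sub, expand into a 32-bit carry
    // chain. Schematically, for the add case:
    //   %lo  = S_ADD_U32  %src0.sub0, %src1.sub0   ; sets SCC to the carry-out
    //   %hi  = S_ADDC_U32 %src0.sub1, %src1.sub1   ; consumes SCC as carry-in
    //   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1
    // (S_SUB_U32 / S_SUBB_U32 are used for the sub case.)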
5466 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5467 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5468
5469 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5470 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5471
5472 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5473 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5474 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5475 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5476
5477 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5478 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5479 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5480 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5481
5482 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5483 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5484 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5485 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5486 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5487 .addReg(DestSub0)
5488 .addImm(AMDGPU::sub0)
5489 .addReg(DestSub1)
5490 .addImm(AMDGPU::sub1);
5491 }
5492 MI.eraseFromParent();
5493 return BB;
5494}
5495
5497 switch (Opc) {
5498 case AMDGPU::S_MIN_U32:
5499 return std::numeric_limits<uint32_t>::max();
5500 case AMDGPU::S_MIN_I32:
5501 return std::numeric_limits<int32_t>::max();
5502 case AMDGPU::S_MAX_U32:
5503 return std::numeric_limits<uint32_t>::min();
5504 case AMDGPU::S_MAX_I32:
5505 return std::numeric_limits<int32_t>::min();
5506 case AMDGPU::V_ADD_F32_e64: // -0.0
5507 return 0x80000000;
5508 case AMDGPU::V_SUB_F32_e64: // +0.0
5509 return 0x0;
5510 case AMDGPU::S_ADD_I32:
5511 case AMDGPU::S_SUB_I32:
5512 case AMDGPU::S_OR_B32:
5513 case AMDGPU::S_XOR_B32:
5514 return std::numeric_limits<uint32_t>::min();
5515 case AMDGPU::S_AND_B32:
5516 return std::numeric_limits<uint32_t>::max();
5517 case AMDGPU::V_MIN_F32_e64:
5518 case AMDGPU::V_MAX_F32_e64:
5519 return 0x7fc00000; // qNAN
5520 default:
5522 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5523 }
5524}
5525
5527 switch (Opc) {
5528 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5529 return std::numeric_limits<uint64_t>::max();
5530 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5531 return std::numeric_limits<int64_t>::max();
5532 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5533 return std::numeric_limits<uint64_t>::min();
5534 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5535 return std::numeric_limits<int64_t>::min();
5536 case AMDGPU::V_MIN_F64_e64:
5537 case AMDGPU::V_MAX_F64_e64:
5538 case AMDGPU::V_MIN_NUM_F64_e64:
5539 case AMDGPU::V_MAX_NUM_F64_e64:
5540 return 0x7FF8000000000000; // qNAN
5541 case AMDGPU::S_ADD_U64_PSEUDO:
5542 case AMDGPU::S_SUB_U64_PSEUDO:
5543 case AMDGPU::S_OR_B64:
5544 case AMDGPU::S_XOR_B64:
5545 return std::numeric_limits<uint64_t>::min();
5546 case AMDGPU::S_AND_B64:
5547 return std::numeric_limits<uint64_t>::max();
5548 case AMDGPU::V_ADD_F64_e64:
5549 case AMDGPU::V_ADD_F64_pseudo_e64:
5550 return 0x8000000000000000; // -0.0
5551 default:
5553 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5554 }
5555}
5556
5557static bool is32bitWaveReduceOperation(unsigned Opc) {
5558 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5559 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5560 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5561 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5562 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5563 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5564 Opc == AMDGPU::V_SUB_F32_e64;
5565}
5566
5568 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5569 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5570 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5571 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5572 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5573}
5574
5577 const GCNSubtarget &ST,
5578 unsigned Opc) {
5580 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5581 const DebugLoc &DL = MI.getDebugLoc();
5582 const SIInstrInfo *TII = ST.getInstrInfo();
5583
5584 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5585 Register SrcReg = MI.getOperand(1).getReg();
5586 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5587 Register DstReg = MI.getOperand(0).getReg();
5588 MachineBasicBlock *RetBB = nullptr;
5589 if (isSGPR) {
5590 switch (Opc) {
5591 case AMDGPU::S_MIN_U32:
5592 case AMDGPU::S_MIN_I32:
5593 case AMDGPU::V_MIN_F32_e64:
5594 case AMDGPU::S_MAX_U32:
5595 case AMDGPU::S_MAX_I32:
5596 case AMDGPU::V_MAX_F32_e64:
5597 case AMDGPU::S_AND_B32:
5598 case AMDGPU::S_OR_B32: {
5599 // Idempotent operations.
5600 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5601 RetBB = &BB;
5602 break;
5603 }
5604 case AMDGPU::V_CMP_LT_U64_e64: // umin
5605 case AMDGPU::V_CMP_LT_I64_e64: // min
5606 case AMDGPU::V_CMP_GT_U64_e64: // umax
5607 case AMDGPU::V_CMP_GT_I64_e64: // max
5608 case AMDGPU::V_MIN_F64_e64:
5609 case AMDGPU::V_MIN_NUM_F64_e64:
5610 case AMDGPU::V_MAX_F64_e64:
5611 case AMDGPU::V_MAX_NUM_F64_e64:
5612 case AMDGPU::S_AND_B64:
5613 case AMDGPU::S_OR_B64: {
5614 // Idempotent operations.
5615 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5616 RetBB = &BB;
5617 break;
5618 }
5619 case AMDGPU::S_XOR_B32:
5620 case AMDGPU::S_XOR_B64:
5621 case AMDGPU::S_ADD_I32:
5622 case AMDGPU::S_ADD_U64_PSEUDO:
5623 case AMDGPU::V_ADD_F32_e64:
5624 case AMDGPU::V_ADD_F64_e64:
5625 case AMDGPU::V_ADD_F64_pseudo_e64:
5626 case AMDGPU::S_SUB_I32:
5627 case AMDGPU::S_SUB_U64_PSEUDO:
5628 case AMDGPU::V_SUB_F32_e64: {
5629 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5630 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5631 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5632 Register NumActiveLanes =
5633 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5634
5635 bool IsWave32 = ST.isWave32();
5636 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5637 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5638 unsigned BitCountOpc =
5639 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5640
5641 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5642
5643 auto NewAccumulator =
5644 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5645 .addReg(ExecMask);
5646
5647 switch (Opc) {
5648 case AMDGPU::S_XOR_B32:
5649 case AMDGPU::S_XOR_B64: {
5650 // Performing an XOR operation on a uniform value
5651 // depends on the parity of the number of active lanes.
5652 // For even parity, the result will be 0, for odd
5653 // parity the result will be the same as the input value.
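      // For example, with 5 active lanes the result is Src * (5 & 1) = Src,
      // while with 4 active lanes it is Src * (4 & 1) = 0.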
5654 Register ParityRegister =
5655 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5656
5657 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5658 .addReg(NewAccumulator->getOperand(0).getReg())
5659 .addImm(1)
5660 .setOperandDead(3); // Dead scc
5661 if (Opc == AMDGPU::S_XOR_B32) {
5662 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5663 .addReg(SrcReg)
5664 .addReg(ParityRegister);
5665 } else {
5666 Register DestSub0 =
5667 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5668 Register DestSub1 =
5669 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5670
5671 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5672 const TargetRegisterClass *SrcSubRC =
5673 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5674
5675 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5676 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5677 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5678 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5679
5680 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5681 .add(Op1L)
5682 .addReg(ParityRegister);
5683
5684 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5685 .add(Op1H)
5686 .addReg(ParityRegister);
5687
5688 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5689 .addReg(DestSub0)
5690 .addImm(AMDGPU::sub0)
5691 .addReg(DestSub1)
5692 .addImm(AMDGPU::sub1);
5693 }
5694 break;
5695 }
5696 case AMDGPU::S_SUB_I32: {
5697 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5698
5699 // Take the negation of the source operand.
5700 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5701 .addImm(0)
5702 .addReg(SrcReg);
5703 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5704 .addReg(NegatedVal)
5705 .addReg(NewAccumulator->getOperand(0).getReg());
5706 break;
5707 }
5708 case AMDGPU::S_ADD_I32: {
5709 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5710 .addReg(SrcReg)
5711 .addReg(NewAccumulator->getOperand(0).getReg());
5712 break;
5713 }
5714 case AMDGPU::S_ADD_U64_PSEUDO:
5715 case AMDGPU::S_SUB_U64_PSEUDO: {
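      // For a uniform 64-bit input the reduction is Src * N for the add case
      // and Src * -N for the sub case, where N is the number of active lanes.
      // The product is formed with 32-bit scalar multiplies (low product,
      // mulhi carry, and the cross terms), then recombined with REG_SEQUENCE.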
5716 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5717 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5718 Register Op1H_Op0L_Reg =
5719 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5720 Register Op1L_Op0H_Reg =
5721 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5722 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5723 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5724 Register NegatedValLo =
5725 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5726 Register NegatedValHi =
5727 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5728
5729 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5730 const TargetRegisterClass *Src1SubRC =
5731 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5732
5733 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5734 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5735 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5736 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5737
5738 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5739 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5740 .addImm(0)
5741 .addReg(NewAccumulator->getOperand(0).getReg())
5742 .setOperandDead(3); // Dead scc
5743 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5744 .addReg(NegatedValLo)
5745 .addImm(31)
5746 .setOperandDead(3); // Dead scc
5747 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5748 .add(Op1L)
5749 .addReg(NegatedValHi);
5750 }
5751 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5752 ? NegatedValLo
5753 : NewAccumulator->getOperand(0).getReg();
5754 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5755 .add(Op1L)
5756 .addReg(LowOpcode);
5757 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5758 .add(Op1L)
5759 .addReg(LowOpcode);
5760 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5761 .add(Op1H)
5762 .addReg(LowOpcode);
5763
5764 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5765 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5766 .addReg(CarryReg)
5767 .addReg(Op1H_Op0L_Reg)
5768 .setOperandDead(3); // Dead scc
5769
5770 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5771 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5772 .addReg(HiVal)
5773 .addReg(Op1L_Op0H_Reg)
5774 .setOperandDead(3); // Dead scc
5775 }
5776 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5777 .addReg(DestSub0)
5778 .addImm(AMDGPU::sub0)
5779 .addReg(DestSub1)
5780 .addImm(AMDGPU::sub1);
5781 break;
5782 }
5783 case AMDGPU::V_ADD_F32_e64:
5784 case AMDGPU::V_ADD_F64_e64:
5785 case AMDGPU::V_ADD_F64_pseudo_e64:
5786 case AMDGPU::V_SUB_F32_e64: {
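      // For a uniform floating-point input the add/sub reduction is
      // Src * popcount(EXEC) (with Src negated for the sub case): convert the
      // lane count to FP, multiply, and read the result back into SGPRs.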
5787 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5788 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
5789 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
5790 Register DstVreg = MRI.createVirtualRegister(VregRC);
5791 // Get number of active lanes as a float val.
5792 BuildMI(BB, MI, DL,
5793 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5794 : AMDGPU::V_CVT_F64_I32_e64),
5795 ActiveLanesVreg)
5796 .addReg(NewAccumulator->getOperand(0).getReg())
5797 .addImm(0) // clamp
5798 .addImm(0); // output-modifier
5799
5800 // Take negation of input for SUB reduction
5801 unsigned srcMod =
5802 (Opc == AMDGPU::V_SUB_F32_e64 ||
5803 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5804 ? SISrcMods::NEG
5805 : SISrcMods::NONE;
5806 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5807 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
5808 ? AMDGPU::V_MUL_F64_pseudo_e64
5809 : AMDGPU::V_MUL_F64_e64;
5810 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
5811 DstVreg)
5812 .addImm(srcMod) // src0 modifier
5813 .addReg(SrcReg)
5814 .addImm(SISrcMods::NONE) // src1 modifier
5815 .addReg(ActiveLanesVreg)
5816 .addImm(SISrcMods::NONE) // clamp
5817 .addImm(SISrcMods::NONE); // output-mod
5818 if (is32BitOpc) {
5819 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5820 .addReg(DstVreg);
5821 } else {
5822 Register LaneValueLoReg =
5823 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5824 Register LaneValueHiReg =
5825 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5826 const TargetRegisterClass *VregSubRC =
5827 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5828 MachineOperand Op1L =
5829 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5830 VregRC, AMDGPU::sub0, VregSubRC);
5831 MachineOperand Op1H =
5832 TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
5833 VregRC, AMDGPU::sub1, VregSubRC);
5834 // lane value input should be in an sgpr
5835 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5836 LaneValueLoReg)
5837 .add(Op1L);
5838 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
5839 LaneValueHiReg)
5840 .add(Op1H);
5841 NewAccumulator =
5842 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5843 .addReg(LaneValueLoReg)
5844 .addImm(AMDGPU::sub0)
5845 .addReg(LaneValueHiReg)
5846 .addImm(AMDGPU::sub1);
5847 }
5848 }
5849 }
5850 RetBB = &BB;
5851 }
5852 }
5853 } else {
5854 // TODO: Implement the DPP strategy and switch based on the immediate
5855 // strategy operand. For now, for all the cases (default, Iterative and
5856 // DPP) we use the iterative approach by default.
5857
5858 // To reduce the VGPR using the iterative approach, we need to iterate
5859 // over all the active lanes. The lowering consists of a ComputeLoop,
5860 // which iterates over only the active lanes. We use a copy of the EXEC
5861 // register as the induction variable; each iteration clears the bit of the
5862 // lane just processed (using bitset0) to get the next active lane.
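    // Schematically (a sketch; the exact opcodes depend on the wave size and
    // on whether the reduction is 32-bit or 64-bit):
    //   entry:       ActiveBits  = copy of EXEC; Accumulator = identity value
    //   ComputeLoop: Lane        = S_FF1_I32 ActiveBits     ; next active lane
    //                LaneValue   = V_READLANE_B32 Src, Lane
    //                Accumulator = <Opc> Accumulator, LaneValue
    //                ActiveBits  = S_BITSET0 Lane, ActiveBits
    //                S_CMP_LG ActiveBits, 0; S_CBRANCH_SCC1 ComputeLoop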
5864 Register SrcReg = MI.getOperand(1).getReg();
5865 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5867
5868 // Create Control flow for loop
5869 // Split MI's Machine Basic block into For loop
5870 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5871
5872 // Create virtual registers required for lowering.
5873 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5874 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5875 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5876 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5877 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5878 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5879 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5880 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5881 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5882
5883 bool IsWave32 = ST.isWave32();
5884 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5885 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5886
5887 // Create the initial values of the induction variable (from EXEC) and the
5888 // accumulator, and insert a branch to the newly created ComputeLoop block.
5889 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5890 if (is32BitOpc) {
5891 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5892 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5893 .addImm(IdentityValue);
5894 } else {
5895 uint64_t IdentityValue =
5896 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5897 ? 0x0 // +0.0 for double sub reduction
5898 : getIdentityValueFor64BitWaveReduction(Opc);
5899 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5900 .addImm(IdentityValue);
5901 }
5902 // clang-format off
5903 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5904 .addMBB(ComputeLoop);
5905 // clang-format on
5906
5907 // Start constructing ComputeLoop
5908 I = ComputeLoop->begin();
5909 auto Accumulator =
5910 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5911 .addReg(IdentityValReg)
5912 .addMBB(&BB);
5913 auto ActiveBits =
5914 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5915 .addReg(LoopIterator)
5916 .addMBB(&BB);
5917
5918 I = ComputeLoop->end();
5919 MachineInstr *NewAccumulator;
5920 // Perform the computations
5921 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5922 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5923 .addReg(ActiveBitsReg);
5924 if (is32BitOpc) {
5925 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5926 LaneValueReg)
5927 .addReg(SrcReg)
5928 .addReg(FF1Reg);
5929 if (isFPOp) {
5930 Register LaneValVreg =
5931 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5932 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5933 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5934 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5935 LaneValVreg)
5936 .addReg(LaneValueReg);
5937 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
5938 .addImm(0) // src0 modifier
5939 .addReg(Accumulator->getOperand(0).getReg())
5940 .addImm(0) // src1 modifier
5941 .addReg(LaneValVreg)
5942 .addImm(0) // clamp
5943 .addImm(0); // omod
5944 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5945 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5946 .addReg(DstVreg);
5947 } else {
5948 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5949 .addReg(Accumulator->getOperand(0).getReg())
5950 .addReg(LaneValueReg);
5951 }
5952 } else {
5953 Register LaneValueLoReg =
5954 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5955 Register LaneValueHiReg =
5956 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5957 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5958 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5959 const TargetRegisterClass *SrcSubRC =
5960 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5961 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5962 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5963 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5964 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5965 // lane value input should be in an sgpr
5966 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5967 LaneValueLoReg)
5968 .add(Op1L)
5969 .addReg(FF1Reg);
5970 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5971 LaneValueHiReg)
5972 .add(Op1H)
5973 .addReg(FF1Reg);
5974 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5975 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5976 .addReg(LaneValueLoReg)
5977 .addImm(AMDGPU::sub0)
5978 .addReg(LaneValueHiReg)
5979 .addImm(AMDGPU::sub1);
5980 switch (Opc) {
5981 case AMDGPU::S_OR_B64:
5982 case AMDGPU::S_AND_B64:
5983 case AMDGPU::S_XOR_B64: {
5984 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5985 .addReg(Accumulator->getOperand(0).getReg())
5986 .addReg(LaneValue->getOperand(0).getReg())
5987 .setOperandDead(3); // Dead scc
5988 break;
5989 }
5990 case AMDGPU::V_CMP_GT_I64_e64:
5991 case AMDGPU::V_CMP_GT_U64_e64:
5992 case AMDGPU::V_CMP_LT_I64_e64:
5993 case AMDGPU::V_CMP_LT_U64_e64: {
5994 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5995 Register ComparisonResultReg =
5996 MRI.createVirtualRegister(WaveMaskRegClass);
5997 int SrcIdx =
5998 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
5999 const TargetRegisterClass *VregClass =
6000 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6001 const TargetRegisterClass *VSubRegClass =
6002 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6003 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6004 MachineOperand SrcReg0Sub0 =
6005 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
6006 VregClass, AMDGPU::sub0, VSubRegClass);
6007 MachineOperand SrcReg0Sub1 =
6008 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
6009 VregClass, AMDGPU::sub1, VSubRegClass);
6010 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
6011 AccumulatorVReg)
6012 .add(SrcReg0Sub0)
6013 .addImm(AMDGPU::sub0)
6014 .add(SrcReg0Sub1)
6015 .addImm(AMDGPU::sub1);
6016 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6017 .addReg(LaneValue->getOperand(0).getReg())
6018 .addReg(AccumulatorVReg);
6019
6020 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6021 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6022 .addReg(LaneMaskReg)
6023 .addReg(ActiveBitsReg);
6024
6025 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6026 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6027 .addReg(LaneValue->getOperand(0).getReg())
6028 .addReg(Accumulator->getOperand(0).getReg());
6029 break;
6030 }
6031 case AMDGPU::V_MIN_F64_e64:
6032 case AMDGPU::V_MIN_NUM_F64_e64:
6033 case AMDGPU::V_MAX_F64_e64:
6034 case AMDGPU::V_MAX_NUM_F64_e64:
6035 case AMDGPU::V_ADD_F64_e64:
6036 case AMDGPU::V_ADD_F64_pseudo_e64: {
6037 int SrcIdx =
6038 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6039 const TargetRegisterClass *VregRC =
6040 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6041 const TargetRegisterClass *VregSubRC =
6042 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6043 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6044 Register DstVreg = MRI.createVirtualRegister(VregRC);
6045 Register LaneValLo =
6046 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6047 Register LaneValHi =
6048 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6049 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6050 .addReg(Accumulator->getOperand(0).getReg());
6051 unsigned Modifier =
6052 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6053 ? SISrcMods::NEG
6054 : SISrcMods::NONE;
6055 auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6056 .addImm(Modifier) // src0 modifiers
6057 .addReg(LaneValue->getOperand(0).getReg())
6058 .addImm(SISrcMods::NONE) // src1 modifiers
6059 .addReg(AccumulatorVReg)
6060 .addImm(SISrcMods::NONE) // clamp
6061 .addImm(SISrcMods::NONE); // omod
6062 auto ReadLaneLo =
6063 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6064 LaneValLo);
6065 auto ReadLaneHi =
6066 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6067 LaneValHi);
6068 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6069 MachineOperand Op1L =
6070 TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
6071 VregRC, AMDGPU::sub0, VregSubRC);
6072 MachineOperand Op1H =
6073 TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
6074 VregRC, AMDGPU::sub1, VregSubRC);
6075 ReadLaneLo.add(Op1L);
6076 ReadLaneHi.add(Op1H);
6077 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6078 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
6079 .addReg(LaneValLo)
6080 .addImm(AMDGPU::sub0)
6081 .addReg(LaneValHi)
6082 .addImm(AMDGPU::sub1);
6083 break;
6084 }
6085 case AMDGPU::S_ADD_U64_PSEUDO:
6086 case AMDGPU::S_SUB_U64_PSEUDO: {
6087 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6088 .addReg(Accumulator->getOperand(0).getReg())
6089 .addReg(LaneValue->getOperand(0).getReg());
6090 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6091 break;
6092 }
6093 }
6094 }
6095 // Manipulate the iterator to get the next active lane
6096 unsigned BITSETOpc =
6097 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6098 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6099 .addReg(FF1Reg)
6100 .addReg(ActiveBitsReg);
6101
6102 // Add phi nodes
6103 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6104 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6105
6106 // Create the loop branch: repeat while any active lanes remain.
6107 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6108 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6109 .addReg(NewActiveBitsReg)
6110 .addImm(0);
6111 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6112 .addMBB(ComputeLoop);
6113
6114 RetBB = ComputeEnd;
6115 }
6116 MI.eraseFromParent();
6117 return RetBB;
6118}
6119
6122 MachineBasicBlock *BB) const {
6123 MachineFunction *MF = BB->getParent();
6125 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6127 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6129 const DebugLoc &DL = MI.getDebugLoc();
6130
6131 switch (MI.getOpcode()) {
6132 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6133 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6134 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6135 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6136 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6137 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6138 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6139 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6140 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6141 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6142 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6143 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6144 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6145 ? AMDGPU::V_MIN_NUM_F64_e64
6146 : AMDGPU::V_MIN_F64_e64);
6147 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6148 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6149 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6150 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6151 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6152 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6153 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6154 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6155 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6156 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6157 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6158 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6159 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6160 ? AMDGPU::V_MAX_NUM_F64_e64
6161 : AMDGPU::V_MAX_F64_e64);
6162 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6163 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6164 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6165 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6166 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6167 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6168 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6169 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6170 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6171 ? AMDGPU::V_ADD_F64_pseudo_e64
6172 : AMDGPU::V_ADD_F64_e64);
6173 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6174 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6175 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6176 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6177 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6178 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6179 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6180 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6181 // fadd + neg, by setting the NEG bit in the instruction.
6182 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6183 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6184 ? AMDGPU::V_ADD_F64_pseudo_e64
6185 : AMDGPU::V_ADD_F64_e64);
6186 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6187 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6188 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6189 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6190 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6191 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6192 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6193 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6194 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6195 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6196 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6197 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6198 case AMDGPU::S_UADDO_PSEUDO:
6199 case AMDGPU::S_USUBO_PSEUDO: {
6200 MachineOperand &Dest0 = MI.getOperand(0);
6201 MachineOperand &Dest1 = MI.getOperand(1);
6202 MachineOperand &Src0 = MI.getOperand(2);
6203 MachineOperand &Src1 = MI.getOperand(3);
6204
6205 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6206 ? AMDGPU::S_ADD_U32
6207 : AMDGPU::S_SUB_U32;
6208 // clang-format off
6209 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6210 .add(Src0)
6211 .add(Src1);
6212 // clang-format on
6213
6214 unsigned SelOpc =
6215 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6216 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6217
6218 MI.eraseFromParent();
6219 return BB;
6220 }
6221 case AMDGPU::S_ADD_U64_PSEUDO:
6222 case AMDGPU::S_SUB_U64_PSEUDO: {
6223 return Expand64BitScalarArithmetic(MI, BB);
6224 }
6225 case AMDGPU::V_ADD_U64_PSEUDO:
6226 case AMDGPU::V_SUB_U64_PSEUDO: {
6227 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6228
6229 MachineOperand &Dest = MI.getOperand(0);
6230 MachineOperand &Src0 = MI.getOperand(1);
6231 MachineOperand &Src1 = MI.getOperand(2);
6232
6233 if (ST.hasAddSubU64Insts()) {
6234 auto I = BuildMI(*BB, MI, DL,
6235 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6236 : AMDGPU::V_SUB_U64_e64),
6237 Dest.getReg())
6238 .add(Src0)
6239 .add(Src1)
6240 .addImm(0); // clamp
6241 TII->legalizeOperands(*I);
6242 MI.eraseFromParent();
6243 return BB;
6244 }
6245
6246 if (IsAdd && ST.hasLshlAddU64Inst()) {
6247 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6248 Dest.getReg())
6249 .add(Src0)
6250 .addImm(0)
6251 .add(Src1);
6252 TII->legalizeOperands(*Add);
6253 MI.eraseFromParent();
6254 return BB;
6255 }
6256
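    // No single 64-bit VALU instruction is available for this case, so expand
    // into a 32-bit carry chain: V_ADD_CO_U32 / V_ADDC_U32 (or the
    // corresponding sub/subb forms) on the sub0/sub1 halves, recombined with
    // a REG_SEQUENCE.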
6257 const auto *CarryRC = TRI->getWaveMaskRegClass();
6258
6259 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6260 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6261
6262 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6263 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6264
6265 const TargetRegisterClass *Src0RC = Src0.isReg()
6266 ? MRI.getRegClass(Src0.getReg())
6267 : &AMDGPU::VReg_64RegClass;
6268 const TargetRegisterClass *Src1RC = Src1.isReg()
6269 ? MRI.getRegClass(Src1.getReg())
6270 : &AMDGPU::VReg_64RegClass;
6271
6272 const TargetRegisterClass *Src0SubRC =
6273 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6274 const TargetRegisterClass *Src1SubRC =
6275 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6276
6277 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6278 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6279 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6280 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6281
6282 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6283 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6284 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6285 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6286
6287 unsigned LoOpc =
6288 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6289 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6290 .addReg(CarryReg, RegState::Define)
6291 .add(SrcReg0Sub0)
6292 .add(SrcReg1Sub0)
6293 .addImm(0); // clamp bit
6294
6295 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6296 MachineInstr *HiHalf =
6297 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6298 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6299 .add(SrcReg0Sub1)
6300 .add(SrcReg1Sub1)
6301 .addReg(CarryReg, RegState::Kill)
6302 .addImm(0); // clamp bit
6303
6304 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6305 .addReg(DestSub0)
6306 .addImm(AMDGPU::sub0)
6307 .addReg(DestSub1)
6308 .addImm(AMDGPU::sub1);
6309 TII->legalizeOperands(*LoHalf);
6310 TII->legalizeOperands(*HiHalf);
6311 MI.eraseFromParent();
6312 return BB;
6313 }
6314 case AMDGPU::S_ADD_CO_PSEUDO:
6315 case AMDGPU::S_SUB_CO_PSEUDO: {
6316 // This pseudo can only be selected from a uniform
6317 // add/subcarry node, so all of the VGPR operands are
6318 // assumed to be splat vectors.
6320 MachineOperand &Dest = MI.getOperand(0);
6321 MachineOperand &CarryDest = MI.getOperand(1);
6322 MachineOperand &Src0 = MI.getOperand(2);
6323 MachineOperand &Src1 = MI.getOperand(3);
6324 MachineOperand &Src2 = MI.getOperand(4);
6325 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6326 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6327 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6328 .addReg(Src0.getReg());
6329 Src0.setReg(RegOp0);
6330 }
6331 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6332 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6333 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6334 .addReg(Src1.getReg());
6335 Src1.setReg(RegOp1);
6336 }
6337 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6338 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6339 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6340 .addReg(Src2.getReg());
6341 Src2.setReg(RegOp2);
6342 }
6343
6344 if (ST.isWave64()) {
6345 if (ST.hasScalarCompareEq64()) {
6346 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6347 .addReg(Src2.getReg())
6348 .addImm(0);
6349 } else {
6350 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6351 const TargetRegisterClass *SubRC =
6352 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6353 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6354 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6355 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6356 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6357 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6358
6359 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6360 .add(Src2Sub0)
6361 .add(Src2Sub1);
6362
6363 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6364 .addReg(Src2_32, RegState::Kill)
6365 .addImm(0);
6366 }
6367 } else {
6368 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6369 .addReg(Src2.getReg())
6370 .addImm(0);
6371 }
6372
6373 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6374 ? AMDGPU::S_ADDC_U32
6375 : AMDGPU::S_SUBB_U32;
6376
6377 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6378
6379 unsigned SelOpc =
6380 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6381
6382 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6383 .addImm(-1)
6384 .addImm(0);
6385
6386 MI.eraseFromParent();
6387 return BB;
6388 }
6389 case AMDGPU::SI_INIT_M0: {
6390 MachineOperand &M0Init = MI.getOperand(0);
6391 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6392 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6393 AMDGPU::M0)
6394 .add(M0Init);
6395 MI.eraseFromParent();
6396 return BB;
6397 }
6398 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6399 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6400 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6401 TII->get(AMDGPU::S_CMP_EQ_U32))
6402 .addImm(0)
6403 .addImm(0);
6404 return BB;
6405 }
6406 case AMDGPU::GET_GROUPSTATICSIZE: {
6407 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6408 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6409 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6410 .add(MI.getOperand(0))
6411 .addImm(MFI->getLDSSize());
6412 MI.eraseFromParent();
6413 return BB;
6414 }
6415 case AMDGPU::GET_SHADERCYCLESHILO: {
6416 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6417 // The algorithm is:
6418 //
6419 // hi1 = getreg(SHADER_CYCLES_HI)
6420 // lo1 = getreg(SHADER_CYCLES_LO)
6421 // hi2 = getreg(SHADER_CYCLES_HI)
6422 //
6423 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6424 // Otherwise there was overflow and the result is hi2:0. In both cases the
6425 // result should represent the actual time at some point during the sequence
6426 // of three getregs.
6427 using namespace AMDGPU::Hwreg;
6428 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6429 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6430 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6431 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6432 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6433 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6434 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6435 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6436 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6437 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6438 .addReg(RegHi1)
6439 .addReg(RegHi2);
6440 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6441 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6442 .addReg(RegLo1)
6443 .addImm(0);
6444 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6445 .add(MI.getOperand(0))
6446 .addReg(RegLo)
6447 .addImm(AMDGPU::sub0)
6448 .addReg(RegHi2)
6449 .addImm(AMDGPU::sub1);
6450 MI.eraseFromParent();
6451 return BB;
6452 }
6453 case AMDGPU::SI_INDIRECT_SRC_V1:
6454 case AMDGPU::SI_INDIRECT_SRC_V2:
6455 case AMDGPU::SI_INDIRECT_SRC_V3:
6456 case AMDGPU::SI_INDIRECT_SRC_V4:
6457 case AMDGPU::SI_INDIRECT_SRC_V5:
6458 case AMDGPU::SI_INDIRECT_SRC_V6:
6459 case AMDGPU::SI_INDIRECT_SRC_V7:
6460 case AMDGPU::SI_INDIRECT_SRC_V8:
6461 case AMDGPU::SI_INDIRECT_SRC_V9:
6462 case AMDGPU::SI_INDIRECT_SRC_V10:
6463 case AMDGPU::SI_INDIRECT_SRC_V11:
6464 case AMDGPU::SI_INDIRECT_SRC_V12:
6465 case AMDGPU::SI_INDIRECT_SRC_V16:
6466 case AMDGPU::SI_INDIRECT_SRC_V32:
6467 return emitIndirectSrc(MI, *BB, *getSubtarget());
6468 case AMDGPU::SI_INDIRECT_DST_V1:
6469 case AMDGPU::SI_INDIRECT_DST_V2:
6470 case AMDGPU::SI_INDIRECT_DST_V3:
6471 case AMDGPU::SI_INDIRECT_DST_V4:
6472 case AMDGPU::SI_INDIRECT_DST_V5:
6473 case AMDGPU::SI_INDIRECT_DST_V6:
6474 case AMDGPU::SI_INDIRECT_DST_V7:
6475 case AMDGPU::SI_INDIRECT_DST_V8:
6476 case AMDGPU::SI_INDIRECT_DST_V9:
6477 case AMDGPU::SI_INDIRECT_DST_V10:
6478 case AMDGPU::SI_INDIRECT_DST_V11:
6479 case AMDGPU::SI_INDIRECT_DST_V12:
6480 case AMDGPU::SI_INDIRECT_DST_V16:
6481 case AMDGPU::SI_INDIRECT_DST_V32:
6482 return emitIndirectDst(MI, *BB, *getSubtarget());
6483 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6484 case AMDGPU::SI_KILL_I1_PSEUDO:
6485 return splitKillBlock(MI, BB);
6486 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6487 Register Dst = MI.getOperand(0).getReg();
6488 const MachineOperand &Src0 = MI.getOperand(1);
6489 const MachineOperand &Src1 = MI.getOperand(2);
6490 Register SrcCond = MI.getOperand(3).getReg();
6491
6492 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6493 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6494 const auto *CondRC = TRI->getWaveMaskRegClass();
6495 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6496
6497 const TargetRegisterClass *Src0RC = Src0.isReg()
6498 ? MRI.getRegClass(Src0.getReg())
6499 : &AMDGPU::VReg_64RegClass;
6500 const TargetRegisterClass *Src1RC = Src1.isReg()
6501 ? MRI.getRegClass(Src1.getReg())
6502 : &AMDGPU::VReg_64RegClass;
6503
6504 const TargetRegisterClass *Src0SubRC =
6505 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6506 const TargetRegisterClass *Src1SubRC =
6507 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6508
6509 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6510 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6511 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6512 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6513
6514 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6515 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6516 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6517 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6518
6519 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6520 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6521 .addImm(0)
6522 .add(Src0Sub0)
6523 .addImm(0)
6524 .add(Src1Sub0)
6525 .addReg(SrcCondCopy);
6526 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6527 .addImm(0)
6528 .add(Src0Sub1)
6529 .addImm(0)
6530 .add(Src1Sub1)
6531 .addReg(SrcCondCopy);
6532
6533 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6534 .addReg(DstLo)
6535 .addImm(AMDGPU::sub0)
6536 .addReg(DstHi)
6537 .addImm(AMDGPU::sub1);
6538 MI.eraseFromParent();
6539 return BB;
6540 }
6541 case AMDGPU::SI_BR_UNDEF: {
6542 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6543 .add(MI.getOperand(0));
6544 Br->getOperand(1).setIsUndef(); // read undef SCC
6545 MI.eraseFromParent();
6546 return BB;
6547 }
6548 case AMDGPU::ADJCALLSTACKUP:
6549 case AMDGPU::ADJCALLSTACKDOWN: {
6551 MachineInstrBuilder MIB(*MF, &MI);
6552 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6553 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6554 return BB;
6555 }
6556 case AMDGPU::SI_CALL_ISEL: {
6557 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6558
6560 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6561
6562 for (const MachineOperand &MO : MI.operands())
6563 MIB.add(MO);
6564
6565 MIB.cloneMemRefs(MI);
6566 MI.eraseFromParent();
6567 return BB;
6568 }
6569 case AMDGPU::V_ADD_CO_U32_e32:
6570 case AMDGPU::V_SUB_CO_U32_e32:
6571 case AMDGPU::V_SUBREV_CO_U32_e32: {
6572 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6573 unsigned Opc = MI.getOpcode();
6574
6575 bool NeedClampOperand = false;
6576 if (TII->pseudoToMCOpcode(Opc) == -1) {
6578 NeedClampOperand = true;
6579 }
6580
6581 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6582 if (TII->isVOP3(*I)) {
6583 I.addReg(TRI->getVCC(), RegState::Define);
6584 }
6585 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6586 if (NeedClampOperand)
6587 I.addImm(0); // clamp bit for e64 encoding
6588
6589 TII->legalizeOperands(*I);
6590
6591 MI.eraseFromParent();
6592 return BB;
6593 }
6594 case AMDGPU::V_ADDC_U32_e32:
6595 case AMDGPU::V_SUBB_U32_e32:
6596 case AMDGPU::V_SUBBREV_U32_e32:
6597 // These instructions have an implicit use of vcc which counts towards the
6598 // constant bus limit.
6599 TII->legalizeOperands(MI);
6600 return BB;
6601 case AMDGPU::DS_GWS_INIT:
6602 case AMDGPU::DS_GWS_SEMA_BR:
6603 case AMDGPU::DS_GWS_BARRIER:
6604 case AMDGPU::DS_GWS_SEMA_V:
6605 case AMDGPU::DS_GWS_SEMA_P:
6606 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6607 // An s_waitcnt 0 is required to be the instruction immediately following.
6608 if (getSubtarget()->hasGWSAutoReplay()) {
6610 return BB;
6611 }
6612
6613 return emitGWSMemViolTestLoop(MI, BB);
6614 case AMDGPU::S_SETREG_B32: {
6615 // Try to optimize cases that only set the denormal mode or rounding mode.
6616 //
6617 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6618 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6619 // instead.
6620 //
6621 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6622 // allow you to have a no-side-effect instruction in the output of a
6623 // side-effecting pattern.
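    // For example (a sketch, using the usual hwreg syntax): a constant write
    // to hwreg(HW_REG_MODE, 0, 4) only touches the rounding-mode bits and can
    // become S_ROUND_MODE, a write to hwreg(HW_REG_MODE, 4, 4) can become
    // S_DENORM_MODE, and a single write covering bits 0-7 can become both.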
6624 auto [ID, Offset, Width] =
6625 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6626 if (ID != AMDGPU::Hwreg::ID_MODE)
6627 return BB;
6628
6629 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6630 const unsigned SetMask = WidthMask << Offset;
6631
6632 if (getSubtarget()->hasDenormModeInst()) {
6633 unsigned SetDenormOp = 0;
6634 unsigned SetRoundOp = 0;
6635
6636 // The dedicated instructions can only set the whole denorm or round mode
6637 // at once, not a subset of bits in either.
6638 if (SetMask ==
6639 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6640 // If this fully sets both the round and denorm mode, emit the two
6641 // dedicated instructions for these.
6642 SetRoundOp = AMDGPU::S_ROUND_MODE;
6643 SetDenormOp = AMDGPU::S_DENORM_MODE;
6644 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6645 SetRoundOp = AMDGPU::S_ROUND_MODE;
6646 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6647 SetDenormOp = AMDGPU::S_DENORM_MODE;
6648 }
6649
6650 if (SetRoundOp || SetDenormOp) {
6651 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6652 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6653 unsigned ImmVal = Def->getOperand(1).getImm();
6654 if (SetRoundOp) {
6655 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6656 .addImm(ImmVal & 0xf);
6657
6658 // If we also have the denorm mode, get just the denorm mode bits.
6659 ImmVal >>= 4;
6660 }
6661
6662 if (SetDenormOp) {
6663 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6664 .addImm(ImmVal & 0xf);
6665 }
6666
6667 MI.eraseFromParent();
6668 return BB;
6669 }
6670 }
6671 }
6672
6673 // If only FP bits are touched, use the no-side-effects pseudo.
6674 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6675 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6676 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6677
6678 return BB;
6679 }
6680 case AMDGPU::S_INVERSE_BALLOT_U32:
6681 case AMDGPU::S_INVERSE_BALLOT_U64:
6682 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6683 // necessary. After that they are equivalent to a COPY.
6684 MI.setDesc(TII->get(AMDGPU::COPY));
6685 return BB;
6686 case AMDGPU::ENDPGM_TRAP: {
6687 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6688 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6689 MI.addOperand(MachineOperand::CreateImm(0));
6690 return BB;
6691 }
6692
6693 // We need a block split to make the real endpgm a terminator. We also don't
6694 // want to break phis in successor blocks, so we can't just delete to the
6695 // end of the block.
6696
6697 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6699 MF->push_back(TrapBB);
6700 // clang-format off
6701 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6702 .addImm(0);
6703 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6704 .addMBB(TrapBB);
6705 // clang-format on
6706
6707 BB->addSuccessor(TrapBB);
6708 MI.eraseFromParent();
6709 return SplitBB;
6710 }
6711 case AMDGPU::SIMULATED_TRAP: {
6712 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6713 MachineBasicBlock *SplitBB =
6714 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6715 MI.eraseFromParent();
6716 return SplitBB;
6717 }
6718 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6719 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6721
6722 // During ISel, it's difficult to propagate the original EXEC mask to use as
6723 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6724 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6725 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6726 Register OriginalExec = Setup->getOperand(0).getReg();
6727 MF->getRegInfo().clearKillFlags(OriginalExec);
6728 MI.getOperand(0).setReg(OriginalExec);
6729 return BB;
6730 }
6731 default:
6732 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6733 if (!MI.mayStore())
6734 AddMemOpInit(MI);
6735 return BB;
6736 }
6737 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6738 }
6739}
6740
6741bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6742 // This currently forces unfolding various combinations of fsub into fma with
6743 // free fneg'd operands. As long as we have fast FMA (controlled by
6744 // isFMAFasterThanFMulAndFAdd), we should perform these.
6745
6746 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6747 // most of these combines appear to be cycle neutral but save on instruction
6748 // count / code size.
6749 return true;
6750}
6751
6752bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6753
6754EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6755 EVT VT) const {
6756 if (!VT.isVector()) {
6757 return MVT::i1;
6758 }
6759 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6760}
6761
6762MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6763 // TODO: Should i16 be used always if legal? For now it would force VALU
6764 // shifts.
6765 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6766}
6767
6768LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6769 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6770 ? Ty.changeElementSize(16)
6771 : Ty.changeElementSize(32);
6772}
6773
6774// Answering this is somewhat tricky and depends on the specific device, since
6775// devices have different rates for fma or all f64 operations.
6776//
6777// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6778// regardless of which device (although the number of cycles differs between
6779// devices), so it is always profitable for f64.
6780//
6781// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6782// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6783// which we can always do even without fused FP ops since it returns the same
6784// result as the separate operations and since it is always full
6785// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6786// however does not support denormals, so we do report fma as faster if we have
6787// a fast fma device and require denormals.
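// For example, with f32 denormals enabled on a full-rate-FMA device,
// (fadd (fmul a, b), c) may be contracted to fma; with denormals flushed and
// mad available, we instead report fma as not faster so v_mad_f32 is preferred.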
6788//
6789bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6790 EVT VT) const {
6791 VT = VT.getScalarType();
6792
6793 switch (VT.getSimpleVT().SimpleTy) {
6794 case MVT::f32: {
6795 // If mad is not available this depends only on if f32 fma is full rate.
6796 if (!Subtarget->hasMadMacF32Insts())
6797 return Subtarget->hasFastFMAF32();
6798
6799 // Otherwise f32 mad is always full rate and returns the same result as
6800 // the separate operations so should be preferred over fma.
6801 // However, it does not support denormals.
6802 if (!denormalModeIsFlushAllF32(MF))
6803 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6804
6805 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6806 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6807 }
6808 case MVT::f64:
6809 return true;
6810 case MVT::f16:
6811 case MVT::bf16:
6812 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6813 default:
6814 break;
6815 }
6816
6817 return false;
6818}
6819
6820bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6821 LLT Ty) const {
6822 switch (Ty.getScalarSizeInBits()) {
6823 case 16:
6824 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6825 case 32:
6826 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6827 case 64:
6828 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6829 default:
6830 break;
6831 }
6832
6833 return false;
6834}
6835
6836bool SITargetLowering::isFMADLegal(const MachineInstr &MI, const LLT Ty) const {
6837 if (!Ty.isScalar())
6838 return false;
6839
6840 if (Ty.getScalarSizeInBits() == 16)
6841 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6842 if (Ty.getScalarSizeInBits() == 32)
6843 return Subtarget->hasMadMacF32Insts() &&
6844 denormalModeIsFlushAllF32(*MI.getMF());
6845
6846 return false;
6847}
6848
6849bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6850 const SDNode *N) const {
6851 // TODO: Check future ftz flag
6852 // v_mad_f32/v_mac_f32 do not support denormals.
6853 EVT VT = N->getValueType(0);
6854 if (VT == MVT::f32)
6855 return Subtarget->hasMadMacF32Insts() &&
6856 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6857 if (VT == MVT::f16) {
6858 return Subtarget->hasMadF16() &&
6859 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6860 }
6861
6862 return false;
6863}
6864
6865//===----------------------------------------------------------------------===//
6866// Custom DAG Lowering Operations
6867//===----------------------------------------------------------------------===//
6868
6869// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6870// wider vector type is legal.
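// For example, a v8f16 operation is split here into two v4f16 halves that are
// concatenated back together, instead of being scalarized into eight ops.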
6871SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6872 SelectionDAG &DAG) const {
6873 unsigned Opc = Op.getOpcode();
6874 EVT VT = Op.getValueType();
6875 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6876 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6877 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6878 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6879 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6880 VT == MVT::v32bf16);
6881
6882 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6883
6884 SDLoc SL(Op);
6885 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6886 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6887
6888 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6889}
6890
6891// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6892// regression whereby extra unnecessary instructions were added to codegen
6893// for rotr operations, caused by legalizing v2i32 OR. This resulted in extra
6894// instructions to extract the result from the vector.
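// Unrolling the vector rotr into per-element i32 rotr nodes keeps each rotate
// individually selectable and avoids the extra vector extracts described above.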
6895SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6896 [[maybe_unused]] EVT VT = Op.getValueType();
6897
6898 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6899 VT == MVT::v16i32) &&
6900 "Unexpected ValueType.");
6901
6902 return DAG.UnrollVectorOp(Op.getNode());
6903}
6904
6905// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6906// wider vector type is legal.
6907SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6908 SelectionDAG &DAG) const {
6909 unsigned Opc = Op.getOpcode();
6910 EVT VT = Op.getValueType();
6911 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6912 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6913 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6914 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6915 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6916 VT == MVT::v32bf16);
6917
6918 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6919 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6920
6921 SDLoc SL(Op);
6922
6923 SDValue OpLo =
6924 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6925 SDValue OpHi =
6926 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6927
6928 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6929}
6930
6931SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6932 SelectionDAG &DAG) const {
6933 unsigned Opc = Op.getOpcode();
6934 EVT VT = Op.getValueType();
6935 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6936 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6937 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6938 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6939 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6940 VT == MVT::v32bf16);
6941
6942 SDValue Op0 = Op.getOperand(0);
6943 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6944 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6945 : std::pair(Op0, Op0);
6946
6947 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6948 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6949
6950 SDLoc SL(Op);
6951 auto ResVT = DAG.GetSplitDestVTs(VT);
6952
6953 SDValue OpLo =
6954 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6955 SDValue OpHi =
6956 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6957
6958 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6959}
6960
6961SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6962 switch (Op.getOpcode()) {
6963 default:
6964 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6965 case ISD::BRCOND:
6966 return LowerBRCOND(Op, DAG);
6967 case ISD::RETURNADDR:
6968 return LowerRETURNADDR(Op, DAG);
6969 case ISD::LOAD: {
6970 SDValue Result = LowerLOAD(Op, DAG);
6971 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6972 "Load should return a value and a chain");
6973 return Result;
6974 }
6975 case ISD::FSQRT: {
6976 EVT VT = Op.getValueType();
6977 if (VT == MVT::f32)
6978 return lowerFSQRTF32(Op, DAG);
6979 if (VT == MVT::f64)
6980 return lowerFSQRTF64(Op, DAG);
6981 return SDValue();
6982 }
6983 case ISD::FSIN:
6984 case ISD::FCOS:
6985 return LowerTrig(Op, DAG);
6986 case ISD::SELECT:
6987 return LowerSELECT(Op, DAG);
6988 case ISD::FDIV:
6989 return LowerFDIV(Op, DAG);
6990 case ISD::FFREXP:
6991 return LowerFFREXP(Op, DAG);
6992 case ISD::ATOMIC_CMP_SWAP:
6993 return LowerATOMIC_CMP_SWAP(Op, DAG);
6994 case ISD::STORE:
6995 return LowerSTORE(Op, DAG);
6996 case ISD::GlobalAddress: {
6997 MachineFunction &MF = DAG.getMachineFunction();
6998 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6999 return LowerGlobalAddress(MFI, Op, DAG);
7000 }
7001 case ISD::ExternalSymbol:
7002 return LowerExternalSymbol(Op, DAG);
7003 case ISD::INTRINSIC_WO_CHAIN:
7004 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7005 case ISD::INTRINSIC_W_CHAIN:
7006 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7007 case ISD::INTRINSIC_VOID:
7008 return LowerINTRINSIC_VOID(Op, DAG);
7009 case ISD::ADDRSPACECAST:
7010 return lowerADDRSPACECAST(Op, DAG);
7011 case ISD::INSERT_SUBVECTOR:
7012 return lowerINSERT_SUBVECTOR(Op, DAG);
7013 case ISD::INSERT_VECTOR_ELT:
7014 return lowerINSERT_VECTOR_ELT(Op, DAG);
7015 case ISD::EXTRACT_VECTOR_ELT:
7016 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7017 case ISD::VECTOR_SHUFFLE:
7018 return lowerVECTOR_SHUFFLE(Op, DAG);
7019 case ISD::SCALAR_TO_VECTOR:
7020 return lowerSCALAR_TO_VECTOR(Op, DAG);
7021 case ISD::BUILD_VECTOR:
7022 return lowerBUILD_VECTOR(Op, DAG);
7023 case ISD::FP_ROUND:
7024 case ISD::STRICT_FP_ROUND:
7025 return lowerFP_ROUND(Op, DAG);
7026 case ISD::TRAP:
7027 return lowerTRAP(Op, DAG);
7028 case ISD::DEBUGTRAP:
7029 return lowerDEBUGTRAP(Op, DAG);
7030 case ISD::ABS:
7031 case ISD::FABS:
7032 case ISD::FNEG:
7033 case ISD::FCANONICALIZE:
7034 case ISD::BSWAP:
7035 return splitUnaryVectorOp(Op, DAG);
7036 case ISD::FMINNUM:
7037 case ISD::FMAXNUM:
7038 return lowerFMINNUM_FMAXNUM(Op, DAG);
7039 case ISD::FMINIMUMNUM:
7040 case ISD::FMAXIMUMNUM:
7041 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7042 case ISD::FMINIMUM:
7043 case ISD::FMAXIMUM:
7044 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7045 case ISD::FLDEXP:
7046 case ISD::STRICT_FLDEXP:
7047 return lowerFLDEXP(Op, DAG);
7048 case ISD::FMA:
7049 return splitTernaryVectorOp(Op, DAG);
7050 case ISD::FP_TO_SINT:
7051 case ISD::FP_TO_UINT:
7052 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7053 Op.getValueType() == MVT::i16 &&
7054 Op.getOperand(0).getValueType() == MVT::f32) {
7055 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7056 return Op;
7057 }
7058 return LowerFP_TO_INT(Op, DAG);
7059 case ISD::SHL:
7060 case ISD::SRA:
7061 case ISD::SRL:
7062 case ISD::ADD:
7063 case ISD::SUB:
7064 case ISD::SMIN:
7065 case ISD::SMAX:
7066 case ISD::UMIN:
7067 case ISD::UMAX:
7068 case ISD::FADD:
7069 case ISD::FMUL:
7070 case ISD::FMINNUM_IEEE:
7071 case ISD::FMAXNUM_IEEE:
7072 case ISD::UADDSAT:
7073 case ISD::USUBSAT:
7074 case ISD::SADDSAT:
7075 case ISD::SSUBSAT:
7076 return splitBinaryVectorOp(Op, DAG);
7077 case ISD::FCOPYSIGN:
7078 return lowerFCOPYSIGN(Op, DAG);
7079 case ISD::MUL:
7080 return lowerMUL(Op, DAG);
7081 case ISD::SMULO:
7082 case ISD::UMULO:
7083 return lowerXMULO(Op, DAG);
7084 case ISD::SMUL_LOHI:
7085 case ISD::UMUL_LOHI:
7086 return lowerXMUL_LOHI(Op, DAG);
7087 case ISD::DYNAMIC_STACKALLOC:
7088 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7089 case ISD::STACKSAVE:
7090 return LowerSTACKSAVE(Op, DAG);
7091 case ISD::GET_ROUNDING:
7092 return lowerGET_ROUNDING(Op, DAG);
7093 case ISD::SET_ROUNDING:
7094 return lowerSET_ROUNDING(Op, DAG);
7095 case ISD::PREFETCH:
7096 return lowerPREFETCH(Op, DAG);
7097 case ISD::FP_EXTEND:
7098 case ISD::STRICT_FP_EXTEND:
7099 return lowerFP_EXTEND(Op, DAG);
7100 case ISD::GET_FPENV:
7101 return lowerGET_FPENV(Op, DAG);
7102 case ISD::SET_FPENV:
7103 return lowerSET_FPENV(Op, DAG);
7104 case ISD::ROTR:
7105 return lowerROTR(Op, DAG);
7106 }
7107 return SDValue();
7108}
7109
7110// Used for D16: Casts the result of an instruction into the right vector,
7111// packs values if loads return unpacked values.
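// For example, an unpacked D16 load of v4f16 is returned as v4i32 with the
// payload in the low 16 bits of each element; the code below truncates each
// element to i16 and repacks the result before bitcasting back to v4f16.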
7112static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
7113 const SDLoc &DL, SelectionDAG &DAG,
7114 bool Unpacked) {
7115 if (!LoadVT.isVector())
7116 return Result;
7117
7118 // Cast back to the original packed type or to a larger type that is a
7119 // multiple of 32 bit for D16. Widening the return type is required for
7120 // legalization.
7121 EVT FittingLoadVT = LoadVT;
7122 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7123 FittingLoadVT =
7124 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7125 LoadVT.getVectorNumElements() + 1);
7126 }
7127
7128 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7129 // Truncate to v2i16/v4i16.
7130 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7131
7132 // Workaround legalizer not scalarizing truncate after vector op
7133 // legalization but not creating intermediate vector trunc.
7134 SmallVector<SDValue, 4> Elts;
7135 DAG.ExtractVectorElements(Result, Elts);
7136 for (SDValue &Elt : Elts)
7137 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7138
7139 // Pad illegal v1i16/v3f16 to v4i16
7140 if ((LoadVT.getVectorNumElements() % 2) == 1)
7141 Elts.push_back(DAG.getPOISON(MVT::i16));
7142
7143 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7144
7145 // Bitcast to original type (v2f16/v4f16).
7146 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7147 }
7148
7149 // Cast back to the original packed type.
7150 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7151}
7152
7153SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7154 SelectionDAG &DAG,
7155 ArrayRef<SDValue> Ops,
7156 bool IsIntrinsic) const {
7157 SDLoc DL(M);
7158
7159 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7160 EVT LoadVT = M->getValueType(0);
7161
7162 EVT EquivLoadVT = LoadVT;
7163 if (LoadVT.isVector()) {
7164 if (Unpacked) {
7165 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7166 LoadVT.getVectorNumElements());
7167 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7168 // Widen v3f16 to legal type
7169 EquivLoadVT =
7170 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7171 LoadVT.getVectorNumElements() + 1);
7172 }
7173 }
7174
7175 // Change from v4f16/v2f16 to EquivLoadVT.
7176 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7177
7178 SDValue Load = DAG.getMemIntrinsicNode(
7179 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7180 M->getMemoryVT(), M->getMemOperand());
7181
7182 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7183
7184 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7185}
7186
7187SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7188 SelectionDAG &DAG,
7189 ArrayRef<SDValue> Ops) const {
7190 SDLoc DL(M);
7191 EVT LoadVT = M->getValueType(0);
7192 EVT EltType = LoadVT.getScalarType();
7193 EVT IntVT = LoadVT.changeTypeToInteger();
7194
7195 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7196
7197 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7198 bool IsTFE = M->getNumValues() == 3;
7199
7200 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7201 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7202 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7203 : AMDGPUISD::BUFFER_LOAD;
7204
7205 if (IsD16) {
7206 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7207 }
7208
7209 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7210 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7211 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7212 IsTFE);
7213
7214 if (isTypeLegal(LoadVT)) {
7215 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7216 M->getMemOperand(), DAG);
7217 }
7218
7219 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7220 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7221 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7222 M->getMemOperand(), DAG);
7223 return DAG.getMergeValues(
7224 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7225 DL);
7226}
7227
7228static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7229 SelectionDAG &DAG) {
7230 EVT VT = N->getValueType(0);
7231 unsigned CondCode = N->getConstantOperandVal(3);
7232 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7233 return DAG.getPOISON(VT);
7234
7235 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7236
7237 SDValue LHS = N->getOperand(1);
7238 SDValue RHS = N->getOperand(2);
7239
7240 SDLoc DL(N);
7241
7242 EVT CmpVT = LHS.getValueType();
7243 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7244 unsigned PromoteOp =
7245 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7246 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7247 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7248 }
7249
7250 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7251
7252 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7253 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7254
7255 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7256 DAG.getCondCode(CCOpcode));
7257 if (VT.bitsEq(CCVT))
7258 return SetCC;
7259 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7260}
7261
7262static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7263 SelectionDAG &DAG) {
7264 EVT VT = N->getValueType(0);
7265
7266 unsigned CondCode = N->getConstantOperandVal(3);
7267 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7268 return DAG.getPOISON(VT);
7269
7270 SDValue Src0 = N->getOperand(1);
7271 SDValue Src1 = N->getOperand(2);
7272 EVT CmpVT = Src0.getValueType();
7273 SDLoc SL(N);
7274
7275 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7276 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7277 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7278 }
7279
7280 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7281 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7282 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7283 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7284 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7285 DAG.getCondCode(CCOpcode));
7286 if (VT.bitsEq(CCVT))
7287 return SetCC;
7288 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7289}
7290
7291static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7292 SelectionDAG &DAG) {
7293 EVT VT = N->getValueType(0);
7294 SDValue Src = N->getOperand(1);
7295 SDLoc SL(N);
7296
7297 if (Src.getOpcode() == ISD::SETCC) {
7298 SDValue Op0 = Src.getOperand(0);
7299 SDValue Op1 = Src.getOperand(1);
7300 // Need to expand bfloat to float for comparison (setcc).
7301 if (Op0.getValueType() == MVT::bf16) {
7302 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7303 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7304 }
7305 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7306 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7307 }
7308 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7309 // (ballot 0) -> 0
7310 if (Arg->isZero())
7311 return DAG.getConstant(0, SL, VT);
7312
7313 // (ballot 1) -> EXEC/EXEC_LO
7314 if (Arg->isOne()) {
7315 Register Exec;
7316 if (VT.getScalarSizeInBits() == 32)
7317 Exec = AMDGPU::EXEC_LO;
7318 else if (VT.getScalarSizeInBits() == 64)
7319 Exec = AMDGPU::EXEC;
7320 else
7321 return SDValue();
7322
7323 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7324 }
7325 }
7326
7327 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7328 // ISD::SETNE)
7329 return DAG.getNode(
7330 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7331 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7332}
7333
7334static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7335 SelectionDAG &DAG) {
7336 EVT VT = N->getValueType(0);
7337 unsigned ValSize = VT.getSizeInBits();
7338 unsigned IID = N->getConstantOperandVal(0);
7339 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7340 IID == Intrinsic::amdgcn_permlanex16;
7341 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7342 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7343 SDLoc SL(N);
7344 MVT IntVT = MVT::getIntegerVT(ValSize);
7345 const GCNSubtarget *ST = TLI.getSubtarget();
7346 unsigned SplitSize = 32;
7347 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7348 ST->hasDPALU_DPP() &&
7349 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7350 SplitSize = 64;
7351
7352 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7353 SDValue Src2, MVT ValT) -> SDValue {
7354 SmallVector<SDValue, 8> Operands;
7355 switch (IID) {
7356 case Intrinsic::amdgcn_permlane16:
7357 case Intrinsic::amdgcn_permlanex16:
7358 case Intrinsic::amdgcn_update_dpp:
7359 Operands.push_back(N->getOperand(6));
7360 Operands.push_back(N->getOperand(5));
7361 Operands.push_back(N->getOperand(4));
7362 [[fallthrough]];
7363 case Intrinsic::amdgcn_writelane:
7364 Operands.push_back(Src2);
7365 [[fallthrough]];
7366 case Intrinsic::amdgcn_readlane:
7367 case Intrinsic::amdgcn_set_inactive:
7368 case Intrinsic::amdgcn_set_inactive_chain_arg:
7369 case Intrinsic::amdgcn_mov_dpp8:
7370 Operands.push_back(Src1);
7371 [[fallthrough]];
7372 case Intrinsic::amdgcn_readfirstlane:
7373 case Intrinsic::amdgcn_permlane64:
7374 Operands.push_back(Src0);
7375 break;
7376 default:
7377 llvm_unreachable("unhandled lane op");
7378 }
7379
7380 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7381 std::reverse(Operands.begin(), Operands.end());
7382
7383 if (SDNode *GL = N->getGluedNode()) {
7384 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7385 GL = GL->getOperand(0).getNode();
7386 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7387 SDValue(GL, 0)));
7388 }
7389
7390 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7391 };
7392
7393 SDValue Src0 = N->getOperand(1);
7394 SDValue Src1, Src2;
7395 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7396 IID == Intrinsic::amdgcn_mov_dpp8 ||
7397 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7398 Src1 = N->getOperand(2);
7399 if (IID == Intrinsic::amdgcn_writelane ||
7400 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7401 Src2 = N->getOperand(3);
7402 }
7403
7404 if (ValSize == SplitSize) {
7405 // Already legal
7406 return SDValue();
7407 }
7408
7409 if (ValSize < 32) {
7410 bool IsFloat = VT.isFloatingPoint();
7411 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7412 SL, MVT::i32);
7413
7414 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7415 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7416 SL, MVT::i32);
7417 }
7418
7419 if (IID == Intrinsic::amdgcn_writelane) {
7420 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7421 SL, MVT::i32);
7422 }
7423
7424 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7425 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7426 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7427 }
7428
7429 if (ValSize % SplitSize != 0)
7430 return SDValue();
7431
7432 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7433 EVT VT = N->getValueType(0);
7434 unsigned NE = VT.getVectorNumElements();
7435 EVT EltVT = VT.getVectorElementType();
7436 SmallVector<SDValue, 8> Scalars;
7437 unsigned NumOperands = N->getNumOperands();
7438 SmallVector<SDValue, 4> Operands(NumOperands);
7439 SDNode *GL = N->getGluedNode();
7440
7441 // only handle convergencectrl_glue
7442 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7443
7444 for (unsigned i = 0; i != NE; ++i) {
7445 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7446 ++j) {
7447 SDValue Operand = N->getOperand(j);
7448 EVT OperandVT = Operand.getValueType();
7449 if (OperandVT.isVector()) {
7450 // A vector operand; extract a single element.
7451 EVT OperandEltVT = OperandVT.getVectorElementType();
7452 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7453 Operand, DAG.getVectorIdxConstant(i, SL));
7454 } else {
7455 // A scalar operand; just use it as is.
7456 Operands[j] = Operand;
7457 }
7458 }
7459
7460 if (GL)
7461 Operands[NumOperands - 1] =
7462 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7463 SDValue(GL->getOperand(0).getNode(), 0));
7464
7465 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7466 }
7467
7468 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7469 return DAG.getBuildVector(VecVT, SL, Scalars);
7470 };
7471
7472 if (VT.isVector()) {
7473 switch (MVT::SimpleValueType EltTy =
7474 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7475 case MVT::i32:
7476 case MVT::f32:
7477 if (SplitSize == 32) {
7478 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7479 return unrollLaneOp(LaneOp.getNode());
7480 }
7481 [[fallthrough]];
7482 case MVT::i16:
7483 case MVT::f16:
7484 case MVT::bf16: {
7485 unsigned SubVecNumElt =
7486 SplitSize / VT.getVectorElementType().getSizeInBits();
7487 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7488 SmallVector<SDValue, 4> Pieces;
7489 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7490 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7491 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7492 DAG.getConstant(EltIdx, SL, MVT::i32));
7493
7494 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7495 IsPermLane16)
7496 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7497 DAG.getConstant(EltIdx, SL, MVT::i32));
7498
7499 if (IID == Intrinsic::amdgcn_writelane)
7500 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7501 DAG.getConstant(EltIdx, SL, MVT::i32));
7502
7503 Pieces.push_back(
7504 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7505 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7506 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7507 EltIdx += SubVecNumElt;
7508 }
7509 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7510 }
7511 default:
7512 // Handle all other cases by bitcasting to i32 vectors
7513 break;
7514 }
7515 }
7516
7517 MVT VecVT =
7518 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7519 Src0 = DAG.getBitcast(VecVT, Src0);
7520
7521 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7522 Src1 = DAG.getBitcast(VecVT, Src1);
7523
7524 if (IID == Intrinsic::amdgcn_writelane)
7525 Src2 = DAG.getBitcast(VecVT, Src2);
7526
7527 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7528 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7529 return DAG.getBitcast(VT, UnrolledLaneOp);
7530}
7531
7533 SelectionDAG &DAG) {
7534 EVT VT = N->getValueType(0);
7535
7536 if (VT.getSizeInBits() != 32)
7537 return SDValue();
7538
7539 SDLoc SL(N);
7540
7541 SDValue Value = N->getOperand(1);
7542 SDValue Index = N->getOperand(2);
7543
7544 // ds_bpermute requires index to be multiplied by 4
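 // (ds_bpermute addresses lanes in bytes, so lane N is selected by byte offset
 // N * 4, hence the shift left by 2 below)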
7545 SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
7546 SDValue ShiftedIndex =
7547 DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
7548
7549 // Intrinsics will require i32 to operate on
7550 SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
7551
7552 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
7553 SmallVector<SDValue> IntrinArgs) -> SDValue {
7554 SmallVector<SDValue> Operands(1);
7555 Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
7556 Operands.append(IntrinArgs);
7557 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
7558 };
7559
7560 // If we can bpermute across the whole wave, then just do that
7561 if (TLI.getSubtarget()->isWave32()) {
7562 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7563 {ShiftedIndex, ValueI32});
7564 return DAG.getBitcast(VT, BPermute);
7565 }
7566
7567 assert(TLI.getSubtarget()->isWave64());
7568
7569 // Otherwise, we need to make use of whole wave mode
7570 SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
7571
7572 // Set inactive lanes to poison
7573 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7574 {ValueI32, PoisonVal});
7575 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7576 {ShiftedIndex, PoisonVal});
7577
7578 SDValue Swapped =
7579 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7580
7581 // Get permutation of each half, then we'll select which one to use
7582 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7583 {WWMIndex, WWMValue});
7584 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7585 MVT::i32, {WWMIndex, Swapped});
7586 SDValue BPermOtherHalfWWM =
7587 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7588
7589 // Select which side to take the permute from
7590 SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
7591 // We can get away with only using mbcnt_lo here since we're only
7592 // trying to detect which side of 32 each lane is on, and mbcnt_lo
7593 // returns 32 for lanes 32-63.
7594 SDValue ThreadID =
7595 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7596 {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
7597
7598 SDValue SameOrOtherHalf =
7599 DAG.getNode(ISD::AND, SL, MVT::i32,
7600 DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
7601 DAG.getTargetConstant(32, SL, MVT::i32));
7602 SDValue UseSameHalf =
7603 DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
7604 DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
7605 SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
7606 BPermOtherHalfWWM);
7607 return DAG.getBitcast(VT, Result);
7608}
7609
7610void SITargetLowering::ReplaceNodeResults(SDNode *N,
7611 SmallVectorImpl<SDValue> &Results,
7612 SelectionDAG &DAG) const {
7613 switch (N->getOpcode()) {
7614 case ISD::INSERT_VECTOR_ELT: {
7615 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7616 Results.push_back(Res);
7617 return;
7618 }
7619 case ISD::EXTRACT_VECTOR_ELT: {
7620 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7621 Results.push_back(Res);
7622 return;
7623 }
7624 case ISD::INTRINSIC_WO_CHAIN: {
7625 unsigned IID = N->getConstantOperandVal(0);
7626 switch (IID) {
7627 case Intrinsic::amdgcn_make_buffer_rsrc:
7628 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7629 return;
7630 case Intrinsic::amdgcn_cvt_pkrtz: {
7631 SDValue Src0 = N->getOperand(1);
7632 SDValue Src1 = N->getOperand(2);
7633 SDLoc SL(N);
7634 SDValue Cvt =
7635 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7636 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7637 return;
7638 }
7639 case Intrinsic::amdgcn_cvt_pknorm_i16:
7640 case Intrinsic::amdgcn_cvt_pknorm_u16:
7641 case Intrinsic::amdgcn_cvt_pk_i16:
7642 case Intrinsic::amdgcn_cvt_pk_u16: {
7643 SDValue Src0 = N->getOperand(1);
7644 SDValue Src1 = N->getOperand(2);
7645 SDLoc SL(N);
7646 unsigned Opcode;
7647
7648 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7649 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7650 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7651 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7652 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7653 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7654 else
7655 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7656
7657 EVT VT = N->getValueType(0);
7658 if (isTypeLegal(VT))
7659 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7660 else {
7661 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7662 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7663 }
7664 return;
7665 }
7666 case Intrinsic::amdgcn_s_buffer_load: {
7667 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7668 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7669 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7670 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7671 // s_buffer_load_i8.
7672 if (!Subtarget->hasScalarSubwordLoads())
7673 return;
7674 SDValue Op = SDValue(N, 0);
7675 SDValue Rsrc = Op.getOperand(1);
7676 SDValue Offset = Op.getOperand(2);
7677 SDValue CachePolicy = Op.getOperand(3);
7678 EVT VT = Op.getValueType();
7679 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7680 SDLoc DL(Op);
7681 MachineFunction &MF = DAG.getMachineFunction();
7682 const DataLayout &DataLayout = DAG.getDataLayout();
7683 Align Alignment =
7684 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7685 MachineMemOperand *MMO = MF.getMachineMemOperand(
7686 MachinePointerInfo(),
7687 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7688 MachineMemOperand::MOInvariant,
7689 VT.getStoreSize(), Alignment);
7690 SDValue LoadVal;
7691 if (!Offset->isDivergent()) {
7692 SDValue Ops[] = {Rsrc, // source register
7693 Offset, CachePolicy};
7694 SDValue BufferLoad =
7695 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
7696 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7697 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7698 } else {
7699 SDValue Ops[] = {
7700 DAG.getEntryNode(), // Chain
7701 Rsrc, // rsrc
7702 DAG.getConstant(0, DL, MVT::i32), // vindex
7703 {}, // voffset
7704 {}, // soffset
7705 {}, // offset
7706 CachePolicy, // cachepolicy
7707 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7708 };
7709 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7710 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7711 }
7712 Results.push_back(LoadVal);
7713 return;
7714 }
7715 case Intrinsic::amdgcn_dead: {
7716 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7717 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7718 return;
7719 }
7720 }
7721 break;
7722 }
7723 case ISD::INTRINSIC_W_CHAIN: {
7724 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7725 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7726 // FIXME: Hacky
7727 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7728 Results.push_back(Res.getOperand(I));
7729 }
7730 } else {
7731 Results.push_back(Res);
7732 Results.push_back(Res.getValue(1));
7733 }
7734 return;
7735 }
7736
7737 break;
7738 }
7739 case ISD::SELECT: {
7740 SDLoc SL(N);
7741 EVT VT = N->getValueType(0);
7742 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7743 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7744 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7745
7746 EVT SelectVT = NewVT;
7747 if (NewVT.bitsLT(MVT::i32)) {
7748 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7749 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7750 SelectVT = MVT::i32;
7751 }
7752
7753 SDValue NewSelect =
7754 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7755
7756 if (NewVT != SelectVT)
7757 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7758 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7759 return;
7760 }
7761 case ISD::FNEG: {
7762 if (N->getValueType(0) != MVT::v2f16)
7763 break;
7764
7765 SDLoc SL(N);
7766 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7767
7768 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7769 DAG.getConstant(0x80008000, SL, MVT::i32));
7770 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7771 return;
7772 }
7773 case ISD::FABS: {
7774 if (N->getValueType(0) != MVT::v2f16)
7775 break;
7776
7777 SDLoc SL(N);
7778 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7779
7780 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7781 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7782 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7783 return;
7784 }
7785 case ISD::FSQRT: {
7786 if (N->getValueType(0) != MVT::f16)
7787 break;
7788 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7789 break;
7790 }
7791 default:
7792 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7793 break;
7794 }
7795}
7796
7797/// Helper function for LowerBRCOND
7798static SDNode *findUser(SDValue Value, unsigned Opcode) {
7799
7800 for (SDUse &U : Value->uses()) {
7801 if (U.get() != Value)
7802 continue;
7803
7804 if (U.getUser()->getOpcode() == Opcode)
7805 return U.getUser();
7806 }
7807 return nullptr;
7808}
7809
7810unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7811 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7812 switch (Intr->getConstantOperandVal(1)) {
7813 case Intrinsic::amdgcn_if:
7814 return AMDGPUISD::IF;
7815 case Intrinsic::amdgcn_else:
7816 return AMDGPUISD::ELSE;
7817 case Intrinsic::amdgcn_loop:
7818 return AMDGPUISD::LOOP;
7819 case Intrinsic::amdgcn_end_cf:
7820 llvm_unreachable("should not occur");
7821 default:
7822 return 0;
7823 }
7824 }
7825
7826 // break, if_break, else_break are all only used as inputs to loop, not
7827 // directly as branch conditions.
7828 return 0;
7829}
7830
7831bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7832 const Triple &TT = getTargetMachine().getTargetTriple();
7833 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7834 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7835 AMDGPU::shouldEmitConstantsToTextSection(TT);
7836}
7837
7838bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7839 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7840 return false;
7841
7842 // FIXME: Either avoid relying on address space here or change the default
7843 // address space for functions to avoid the explicit check.
7844 return (GV->getValueType()->isFunctionTy() ||
7845 GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) &&
7846 !shouldEmitFixup(GV);
7847}
7848
7849bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7850 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7851}
7852
7853bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7854 if (!GV->hasExternalLinkage())
7855 return true;
7856
7857 const auto OS = getTargetMachine().getTargetTriple().getOS();
7858 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7859}
7860
7861/// This transforms the control flow intrinsics to get the branch destination as
7862/// the last parameter; it also switches the branch target with BR if needed.
7863SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7864 SDLoc DL(BRCOND);
7865
7866 SDNode *Intr = BRCOND.getOperand(1).getNode();
7867 SDValue Target = BRCOND.getOperand(2);
7868 SDNode *BR = nullptr;
7869 SDNode *SetCC = nullptr;
7870
7871 switch (Intr->getOpcode()) {
7872 case ISD::SETCC: {
7873 // As long as we negate the condition everything is fine
7874 SetCC = Intr;
7875 Intr = SetCC->getOperand(0).getNode();
7876 break;
7877 }
7878 case ISD::XOR: {
7879 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7880 SDValue LHS = Intr->getOperand(0);
7881 SDValue RHS = Intr->getOperand(1);
7882 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7883 Intr = LHS.getNode();
7884 break;
7885 }
7886 [[fallthrough]];
7887 }
7888 default: {
7889 // Get the target from BR if we don't negate the condition
7890 BR = findUser(BRCOND, ISD::BR);
7891 assert(BR && "brcond missing unconditional branch user");
7892 Target = BR->getOperand(1);
7893 }
7894 }
7895
7896 unsigned CFNode = isCFIntrinsic(Intr);
7897 if (CFNode == 0) {
7898 // This is a uniform branch so we don't need to legalize.
7899 return BRCOND;
7900 }
7901
7902 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7903 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7904
7905 assert(!SetCC ||
7906 (SetCC->getConstantOperandVal(1) == 1 &&
7907 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7908 ISD::SETNE));
7909
7910 // operands of the new intrinsic call
7911 SmallVector<SDValue, 4> Ops;
7912 if (HaveChain)
7913 Ops.push_back(BRCOND.getOperand(0));
7914
7915 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7916 Ops.push_back(Target);
7917
7918 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7919
7920 // build the new intrinsic call
7921 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7922
7923 if (!HaveChain) {
7924 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7925
7926 Result = DAG.getMergeValues(Ops, DL).getNode();
7927 }
7928
7929 if (BR) {
7930 // Give the branch instruction our target
7931 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7932 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7933 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7934 }
7935
7936 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7937
7938 // Copy the intrinsic results to registers
7939 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7940 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7941 if (!CopyToReg)
7942 continue;
7943
7944 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7945 SDValue(Result, i - 1), SDValue());
7946
7947 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7948 }
7949
7950 // Remove the old intrinsic from the chain
7951 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7952 Intr->getOperand(0));
7953
7954 return Chain;
7955}
7956
7957SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7958 MVT VT = Op.getSimpleValueType();
7959 SDLoc DL(Op);
7960 // Checking the depth
7961 if (Op.getConstantOperandVal(0) != 0)
7962 return DAG.getConstant(0, DL, VT);
7963
7964 MachineFunction &MF = DAG.getMachineFunction();
7965 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7966 // Check for kernel and shader functions
7967 if (Info->isEntryFunction())
7968 return DAG.getConstant(0, DL, VT);
7969
7970 MachineFrameInfo &MFI = MF.getFrameInfo();
7971 // There is a call to @llvm.returnaddress in this function
7972 MFI.setReturnAddressIsTaken(true);
7973
7974 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7975 // Get the return address reg and mark it as an implicit live-in
7976 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7977 getRegClassFor(VT, Op.getNode()->isDivergent()));
7978
7979 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7980}
7981
7982SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7983 const SDLoc &DL, EVT VT) const {
7984 return Op.getValueType().bitsLE(VT)
7985 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7986 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7987 DAG.getTargetConstant(0, DL, MVT::i32));
7988}
7989
7990SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7991 SelectionDAG &DAG) const {
7992 EVT DstVT = Op.getValueType();
7993 unsigned NumElts = DstVT.getVectorNumElements();
7994 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7995
7996 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7997
7998 SDLoc DL(Op);
7999 unsigned Opc = Op.getOpcode();
8000 SDValue Flags = Op.getOperand(1);
8001 EVT HalfDstVT =
8002 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8003 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8004 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8005
8006 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8007}
8008
8009SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8010 SDValue Src = Op.getOperand(0);
8011 EVT SrcVT = Src.getValueType();
8012 EVT DstVT = Op.getValueType();
8013
8014 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
8015 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8016 if (SrcVT.getScalarType() != MVT::f32)
8017 return SDValue();
8018 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8019 }
8020
8021 if (SrcVT.getScalarType() != MVT::f64)
8022 return Op;
8023
8024 SDLoc DL(Op);
8025 if (DstVT == MVT::f16) {
8026 // TODO: Handle strictfp
8027 if (Op.getOpcode() != ISD::FP_ROUND)
8028 return Op;
8029
8030 if (!Subtarget->has16BitInsts()) {
8031 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
8032 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8033 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8034 }
8035 if (Op->getFlags().hasApproximateFuncs()) {
8036 SDValue Flags = Op.getOperand(1);
8037 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
8038 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
8039 }
8040 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8041 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8042 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8043 }
8044
8045 assert(DstVT.getScalarType() == MVT::bf16 &&
8046 "custom lower FP_ROUND for f16 or bf16");
8047 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8048
8049 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8050 // hardware f32 -> bf16 instruction.
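 // Rounding to odd in the first step avoids double rounding when the
 // intermediate f32 value is rounded again to bf16.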
8051 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
8052 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
8053 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
8054 DAG.getTargetConstant(0, DL, MVT::i32));
8055}
8056
8057SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8058 SelectionDAG &DAG) const {
8059 EVT VT = Op.getValueType();
8060 const MachineFunction &MF = DAG.getMachineFunction();
8061 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8062 bool IsIEEEMode = Info->getMode().IEEE;
8063
8064 // FIXME: Assert during selection that this is only selected for
8065 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8066 // mode functions, but this happens to be OK since it's only done in cases
8067 // where there is known no sNaN.
8068 if (IsIEEEMode)
8069 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8070
8071 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8072 VT == MVT::v16bf16)
8073 return splitBinaryVectorOp(Op, DAG);
8074 return Op;
8075}
8076
8077SDValue
8078SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8079 SelectionDAG &DAG) const {
8080 EVT VT = Op.getValueType();
8081 const MachineFunction &MF = DAG.getMachineFunction();
8082 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8083 bool IsIEEEMode = Info->getMode().IEEE;
8084
8085 if (IsIEEEMode)
8086 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8087
8088 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8089 VT == MVT::v16bf16)
8090 return splitBinaryVectorOp(Op, DAG);
8091 return Op;
8092}
8093
8094SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8095 SelectionDAG &DAG) const {
8096 EVT VT = Op.getValueType();
8097 if (VT.isVector())
8098 return splitBinaryVectorOp(Op, DAG);
8099
8100 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8101 !Subtarget->hasMinimum3Maximum3F16() &&
8102 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8103 "should not need to widen f16 minimum/maximum to v2f16");
8104
8105 // Widen f16 operation to v2f16
8106
8107 // fminimum f16:x, f16:y ->
8108 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8109 // (v2f16 (scalar_to_vector y))), 0
8110 SDLoc SL(Op);
8111 SDValue WideSrc0 =
8112 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
8113 SDValue WideSrc1 =
8114 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
8115
8116 SDValue Widened =
8117 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8118
8119 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
8120 DAG.getConstant(0, SL, MVT::i32));
8121}
8122
8123SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8124 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8125 EVT VT = Op.getValueType();
8126 assert(VT == MVT::f16);
8127
8128 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8129 EVT ExpVT = Exp.getValueType();
8130 if (ExpVT == MVT::i16)
8131 return Op;
8132
8133 SDLoc DL(Op);
8134
8135 // Correct the exponent type for f16 to i16.
8136 // Clamp the range of the exponent to the instruction's range.
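 // Exponents outside [-32768, 32767] already saturate the f16 result to zero
 // or infinity, so clamping before the truncation below preserves the value.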
8137
8138 // TODO: This should be a generic narrowing legalization, and can easily be
8139 // done for GlobalISel.
8140
8141 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8142 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8143
8144 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8145 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
8146
8147 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8148
8149 if (IsStrict) {
8150 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8151 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8152 }
8153
8154 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8155}
8156
8157static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
8158 switch (Op->getOpcode()) {
8159 case ISD::SRA:
8160 case ISD::SMIN:
8161 case ISD::SMAX:
8162 return ISD::SIGN_EXTEND;
8163 case ISD::SRL:
8164 case ISD::UMIN:
8165 case ISD::UMAX:
8166 return ISD::ZERO_EXTEND;
8167 case ISD::ADD:
8168 case ISD::SUB:
8169 case ISD::AND:
8170 case ISD::OR:
8171 case ISD::XOR:
8172 case ISD::SHL:
8173 case ISD::SELECT:
8174 case ISD::MUL:
8175 // operation result won't be influenced by garbage high bits.
8176 // TODO: are all of those cases correct, and are there more?
8177 return ISD::ANY_EXTEND;
8178 case ISD::SETCC: {
8179 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8180 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8181 }
8182 default:
8183 llvm_unreachable("unexpected opcode!");
8184 }
8185}
8186
8187SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8188 DAGCombinerInfo &DCI) const {
8189 const unsigned Opc = Op.getOpcode();
8190 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8191 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8192 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8193 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8194 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8195
8196 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8197 : Op->getOperand(0).getValueType();
8198 auto &DAG = DCI.DAG;
8199 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8200
8201 if (DCI.isBeforeLegalizeOps() ||
8202 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8203 return SDValue();
8204
8205 SDLoc DL(Op);
8206 SDValue LHS;
8207 SDValue RHS;
8208 if (Opc == ISD::SELECT) {
8209 LHS = Op->getOperand(1);
8210 RHS = Op->getOperand(2);
8211 } else {
8212 LHS = Op->getOperand(0);
8213 RHS = Op->getOperand(1);
8214 }
8215
8216 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8217 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8218
8219 // Special case: for shifts, the RHS always needs a zext.
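 // (an any-extended shift amount could carry garbage in its high bits and turn
 // a small shift into an out-of-range one once widened to i32)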
8220 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8221 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8222 else
8223 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8224
8225 // setcc always returns i1/i1 vec so no need to truncate after.
8226 if (Opc == ISD::SETCC) {
8227 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8228 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8229 }
8230
8231 // For other ops, we extend the operation's return type as well so we need to
8232 // truncate back to the original type.
8233 SDValue NewVal;
8234 if (Opc == ISD::SELECT)
8235 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8236 else
8237 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8238
8239 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8240}
8241
8242SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8243 SDValue Mag = Op.getOperand(0);
8244 EVT MagVT = Mag.getValueType();
8245
8246 if (MagVT.getVectorNumElements() > 2)
8247 return splitBinaryVectorOp(Op, DAG);
8248
8249 SDValue Sign = Op.getOperand(1);
8250 EVT SignVT = Sign.getValueType();
8251
8252 if (MagVT == SignVT)
8253 return Op;
8254
8255 // fcopysign v2f16:mag, v2f32:sign ->
8256 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8257
8258 SDLoc SL(Op);
8259 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8260 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8261
8262 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8263
8264 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8265}
8266
8267// Custom lowering for vector multiplications and s_mul_u64.
8268SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8269 EVT VT = Op.getValueType();
8270
8271 // Split vector operands.
8272 if (VT.isVector())
8273 return splitBinaryVectorOp(Op, DAG);
8274
8275 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8276
8277 // There are four ways to lower s_mul_u64:
8278 //
8279 // 1. If all the operands are uniform, then we lower it as it is.
8280 //
8281 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8282 // multiplications because there is not a vector equivalent of s_mul_u64.
8283 //
8284 // 3. If the cost model decides that it is more efficient to use vector
8285 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8286 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8287 //
8288 // 4. If the cost model decides to use vector registers and both of the
8289 // operands are zero-extended/sign-extended from 32-bits, then we split the
8290 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8291 // possible to check if the operands are zero-extended or sign-extended in
8292 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8293 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8294 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8295 // If the cost model decides that we have to use vector registers, then
8296 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8297 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8298 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8299 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8300 // SIInstrInfo.cpp .
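 // For example, when both operands are (zext i32 x to i64), computeKnownBits
 // reports at least 32 leading zero bits for each, so the check below rewrites
 // the node to S_MUL_U64_U32_PSEUDO.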
8301
8302 if (Op->isDivergent())
8303 return SDValue();
8304
8305 SDValue Op0 = Op.getOperand(0);
8306 SDValue Op1 = Op.getOperand(1);
8307 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
8308 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8309 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8310 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8311 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8312 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8313 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8314 SDLoc SL(Op);
8315 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8316 return SDValue(
8317 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8318 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8319 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8320 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8321 return SDValue(
8322 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8323 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8324 return Op;
8325}
8326
8327SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8328 EVT VT = Op.getValueType();
8329 SDLoc SL(Op);
8330 SDValue LHS = Op.getOperand(0);
8331 SDValue RHS = Op.getOperand(1);
8332 bool isSigned = Op.getOpcode() == ISD::SMULO;
8333
8334 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8335 const APInt &C = RHSC->getAPIntValue();
8336 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8337 if (C.isPowerOf2()) {
8338 // smulo(x, signed_min) is same as umulo(x, signed_min).
8339 bool UseArithShift = isSigned && !C.isMinSignedValue();
8340 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8341 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8342 SDValue Overflow =
8343 DAG.getSetCC(SL, MVT::i1,
8344 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8345 Result, ShiftAmt),
8346 LHS, ISD::SETNE);
8347 return DAG.getMergeValues({Result, Overflow}, SL);
8348 }
8349 }
8350
8351 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8352 SDValue Top =
8353 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8354
8355 SDValue Sign = isSigned
8356 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8357 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8358 SL, MVT::i32))
8359 : DAG.getConstant(0, SL, VT);
8360 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8361
8362 return DAG.getMergeValues({Result, Overflow}, SL);
8363}
8364
8365SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8366 if (Op->isDivergent()) {
8367 // Select to V_MAD_[IU]64_[IU]32.
8368 return Op;
8369 }
8370 if (Subtarget->hasSMulHi()) {
8371 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8372 return SDValue();
8373 }
8374 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8375 // calculate the high part, so we might as well do the whole thing with
8376 // V_MAD_[IU]64_[IU]32.
8377 return Op;
8378}
8379
8380SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8381 if (!Subtarget->hasTrapHandler() ||
8382 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8383 return lowerTrapEndpgm(Op, DAG);
8384
8385 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8386 : lowerTrapHsaQueuePtr(Op, DAG);
8387}
8388
8389SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8390 SDLoc SL(Op);
8391 SDValue Chain = Op.getOperand(0);
8392 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8393}
8394
8395SDValue
8396SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8397 const SDLoc &DL, Align Alignment,
8398 ImplicitParameter Param) const {
8399 MachineFunction &MF = DAG.getMachineFunction();
8400 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8401 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8402 MachinePointerInfo PtrInfo =
8404 return DAG.getLoad(
8405 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8407}
8408
8409SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8410 SelectionDAG &DAG) const {
8411 SDLoc SL(Op);
8412 SDValue Chain = Op.getOperand(0);
8413
8414 SDValue QueuePtr;
8415 // For code object version 5, QueuePtr is passed through implicit kernarg.
8416 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8417 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8418 QueuePtr =
8419 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8420 } else {
8421 MachineFunction &MF = DAG.getMachineFunction();
8422 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8423 Register UserSGPR = Info->getQueuePtrUserSGPR();
8424
8425 if (UserSGPR == AMDGPU::NoRegister) {
8426 // We probably are in a function incorrectly marked with
8427 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8428 // trap, so just use a null pointer.
8429 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8430 } else {
8431 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8432 MVT::i64);
8433 }
8434 }
8435
8436 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8437 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8438
8439 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8440 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8441 ToReg.getValue(1)};
8442 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8443}
8444
8445SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8446 SDLoc SL(Op);
8447 SDValue Chain = Op.getOperand(0);
8448
8449 // We need to simulate the 's_trap 2' instruction on targets that run in
8450 // PRIV=1 (where it is treated as a nop).
8451 if (Subtarget->hasPrivEnabledTrap2NopBug())
8452 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8453
8454 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8455 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8456 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8457}
8458
8459SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8460 SDLoc SL(Op);
8461 SDValue Chain = Op.getOperand(0);
8462 MachineFunction &MF = DAG.getMachineFunction();
8463
8464 if (!Subtarget->hasTrapHandler() ||
8465 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8466 LLVMContext &Ctx = MF.getFunction().getContext();
8467 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8468 "debugtrap handler not supported",
8469 Op.getDebugLoc(), DS_Warning));
8470 return Chain;
8471 }
8472
8473 uint64_t TrapID =
8474 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8475 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8476 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8477}
8478
8479SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8480 SelectionDAG &DAG) const {
8481 if (Subtarget->hasApertureRegs()) {
8482 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8483 ? AMDGPU::SRC_SHARED_BASE
8484 : AMDGPU::SRC_PRIVATE_BASE;
8485 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8486 !Subtarget->hasGloballyAddressableScratch()) &&
8487 "Cannot use src_private_base with globally addressable scratch!");
8488 // Note: this feature (register) is broken. When used as a 32-bit operand,
8489 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8490 // bits.
8491 //
8492 // To work around the issue, emit a 64 bit copy from this register
8493 // then extract the high bits. Note that this shouldn't even result in a
8494 // shift being emitted and simply become a pair of registers (e.g.):
8495 // s_mov_b64 s[6:7], src_shared_base
8496 // v_mov_b32_e32 v1, s7
8497 SDValue Copy =
8498 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8499 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8500 }
8501
8502 // For code object version 5, private_base and shared_base are passed through
8503 // implicit kernargs.
8504 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8505 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8506 ImplicitParameter Param =
8507 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8508 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8509 }
8510
8511 MachineFunction &MF = DAG.getMachineFunction();
8512 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8513 Register UserSGPR = Info->getQueuePtrUserSGPR();
8514 if (UserSGPR == AMDGPU::NoRegister) {
8515 // We probably are in a function incorrectly marked with
8516 // amdgpu-no-queue-ptr. This is undefined.
8517 return DAG.getPOISON(MVT::i32);
8518 }
8519
8520 SDValue QueuePtr =
8521 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8522
8523 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8524 // private_segment_aperture_base_hi.
8525 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8526
8527 SDValue Ptr =
8528 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8529
8530 // TODO: Use custom target PseudoSourceValue.
8531 // TODO: We should use the value from the IR intrinsic call, but it might not
8532 // be available and how do we get it?
8533 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8534 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8535 commonAlignment(Align(64), StructOffset),
8536 MachineMemOperand::MODereferenceable |
8537 MachineMemOperand::MOInvariant);
8538}
8539
8540/// Return true if the value is a known valid address, such that a null check is
8541/// not necessary.
8542 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8543 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8544 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8545 return true;
8546
8547 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8548 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8549
8550 // TODO: Search through arithmetic, handle arguments and loads
8551 // marked nonnull.
8552 return false;
8553}
8554
8555SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8556 SelectionDAG &DAG) const {
8557 SDLoc SL(Op);
8558
8559 const AMDGPUTargetMachine &TM =
8560 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8561
8562 unsigned DestAS, SrcAS;
8563 SDValue Src;
8564 bool IsNonNull = false;
8565 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8566 SrcAS = ASC->getSrcAddressSpace();
8567 Src = ASC->getOperand(0);
8568 DestAS = ASC->getDestAddressSpace();
8569 } else {
8570 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8571 Op.getConstantOperandVal(0) ==
8572 Intrinsic::amdgcn_addrspacecast_nonnull);
8573 Src = Op->getOperand(1);
8574 SrcAS = Op->getConstantOperandVal(2);
8575 DestAS = Op->getConstantOperandVal(3);
8576 IsNonNull = true;
8577 }
8578
8579 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8580
8581 // flat -> local/private
8582 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8583 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8584 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8585 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8586
8587 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8588 Subtarget->hasGloballyAddressableScratch()) {
8589 // flat -> private with globally addressable scratch: subtract
8590 // src_flat_scratch_base_lo.
8591 SDValue FlatScratchBaseLo(
8592 DAG.getMachineNode(
8593 AMDGPU::S_MOV_B32, SL, MVT::i32,
8594 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8595 0);
8596 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8597 }
8598
8599 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8600 return Ptr;
8601
8602 unsigned NullVal = TM.getNullPointerValue(DestAS);
8603 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8604 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8605
8606 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8607 SegmentNullPtr);
8608 }
8609 }
8610
8611 // local/private -> flat
8612 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8613 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8614 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8615 SDValue CvtPtr;
8616 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8617 Subtarget->hasGloballyAddressableScratch()) {
8618 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8619 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8620 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8621 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8622 ThreadID = DAG.getNode(
8623 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8624 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8625 AllOnes, ThreadID);
8626 if (Subtarget->isWave64())
8627 ThreadID = DAG.getNode(
8628 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8629 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8630 AllOnes, ThreadID);
8631 SDValue ShAmt = DAG.getShiftAmountConstant(
8632 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8633 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8634 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8635 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8636 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8637 // 64-bit hi:lo value.
8638 SDValue FlatScratchBase = {
8639 DAG.getMachineNode(
8640 AMDGPU::S_MOV_B64, SL, MVT::i64,
8641 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8642 0};
8643 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8644 } else {
8645 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8646 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8647 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8648 }
8649
8650 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8651 return CvtPtr;
8652
8653 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8654 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8655
8656 SDValue NonNull =
8657 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8658
8659 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8660 FlatNullPtr);
8661 }
8662 }
8663
8664 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8665 Op.getValueType() == MVT::i64) {
8666 const SIMachineFunctionInfo *Info =
8667 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8668 if (Info->get32BitAddressHighBits() == 0)
8669 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
8670
8671 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8672 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8673 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8674 }
8675
8676 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8677 Src.getValueType() == MVT::i64)
8678 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8679
8680 // global <-> flat are no-ops and never emitted.
8681
8682 // Invalid casts are poison.
8683 return DAG.getPOISON(Op->getValueType(0));
8684}
8685
8686// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8687// the small vector and inserting them into the big vector. That is better than
8688// the default expansion of doing it via a stack slot. Even though the use of
8689// the stack slot would be optimized away afterwards, the stack slot itself
8690// remains.
8691SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8692 SelectionDAG &DAG) const {
8693 SDValue Vec = Op.getOperand(0);
8694 SDValue Ins = Op.getOperand(1);
8695 SDValue Idx = Op.getOperand(2);
8696 EVT VecVT = Vec.getValueType();
8697 EVT InsVT = Ins.getValueType();
8698 EVT EltVT = VecVT.getVectorElementType();
8699 unsigned InsNumElts = InsVT.getVectorNumElements();
8700 unsigned IdxVal = Idx->getAsZExtVal();
8701 SDLoc SL(Op);
8702
8703 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8704 // Insert 32-bit registers at a time.
8705 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8706
8707 unsigned VecNumElts = VecVT.getVectorNumElements();
8708 EVT NewVecVT =
8709 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8710 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8711 : EVT::getVectorVT(*DAG.getContext(),
8712 MVT::i32, InsNumElts / 2);
8713
8714 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8715 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8716
8717 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8718 SDValue Elt;
8719 if (InsNumElts == 2) {
8720 Elt = Ins;
8721 } else {
8722 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8723 DAG.getConstant(I, SL, MVT::i32));
8724 }
8725 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8726 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8727 }
8728
8729 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8730 }
8731
8732 for (unsigned I = 0; I != InsNumElts; ++I) {
8733 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8734 DAG.getConstant(I, SL, MVT::i32));
8735 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8736 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8737 }
8738 return Vec;
8739}
8740
8741SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8742 SelectionDAG &DAG) const {
8743 SDValue Vec = Op.getOperand(0);
8744 SDValue InsVal = Op.getOperand(1);
8745 SDValue Idx = Op.getOperand(2);
8746 EVT VecVT = Vec.getValueType();
8747 EVT EltVT = VecVT.getVectorElementType();
8748 unsigned VecSize = VecVT.getSizeInBits();
8749 unsigned EltSize = EltVT.getSizeInBits();
8750 SDLoc SL(Op);
8751
8752 // Specially handle the case of v4i16 with static indexing.
8753 unsigned NumElts = VecVT.getVectorNumElements();
8754 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8755 if (NumElts == 4 && EltSize == 16 && KIdx) {
8756 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8757
8758 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8759 DAG.getConstant(0, SL, MVT::i32));
8760 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8761 DAG.getConstant(1, SL, MVT::i32));
8762
8763 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8764 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8765
8766 unsigned Idx = KIdx->getZExtValue();
8767 bool InsertLo = Idx < 2;
8768 SDValue InsHalf = DAG.getNode(
8769 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8770 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8771 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8772
8773 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8774
8775 SDValue Concat =
8776 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8777 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8778
8779 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8780 }
8781
8782 // Static indexing does not lower to stack access, and hence there is no need
8783 // for special custom lowering to avoid stack access.
8784 if (isa<ConstantSDNode>(Idx))
8785 return SDValue();
8786
8787 // Avoid stack access for dynamic indexing by custom lowering to
8788 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8789
8790 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8791
8792 MVT IntVT = MVT::getIntegerVT(VecSize);
8793
8794 // Convert vector index to bit-index and get the required bit mask.
8795 assert(isPowerOf2_32(EltSize));
8796 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8797 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8798 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8799 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8800 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8801
8802 // 1. Create a congruent vector with the target value in each element.
8803 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8804 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8805
8806 // 2. Mask off all other indices except the required index within (1).
8807 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8808
8809 // 3. Mask off the required index within the target vector.
8810 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8811 SDValue RHS =
8812 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8813
8814 // 4. Get (2) and (3) ORed into the target vector.
8815 SDValue BFI =
8816 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8817
8818 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8819}
8820
8821SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8822 SelectionDAG &DAG) const {
8823 SDLoc SL(Op);
8824
8825 EVT ResultVT = Op.getValueType();
8826 SDValue Vec = Op.getOperand(0);
8827 SDValue Idx = Op.getOperand(1);
8828 EVT VecVT = Vec.getValueType();
8829 unsigned VecSize = VecVT.getSizeInBits();
8830 EVT EltVT = VecVT.getVectorElementType();
8831
8832 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8833
8834 // Make sure we do any optimizations that will make it easier to fold
8835 // source modifiers before obscuring it with bit operations.
8836
8837 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8838 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8839 return Combined;
8840
8841 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8842 SDValue Lo, Hi;
8843 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8844
8845 if (VecSize == 128) {
8846 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8847 Lo = DAG.getBitcast(LoVT,
8848 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8849 DAG.getConstant(0, SL, MVT::i32)));
8850 Hi = DAG.getBitcast(HiVT,
8851 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8852 DAG.getConstant(1, SL, MVT::i32)));
8853 } else if (VecSize == 256) {
8854 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8855 SDValue Parts[4];
8856 for (unsigned P = 0; P < 4; ++P) {
8857 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8858 DAG.getConstant(P, SL, MVT::i32));
8859 }
8860
8861 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8862 Parts[0], Parts[1]));
8863 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8864 Parts[2], Parts[3]));
8865 } else {
8866 assert(VecSize == 512);
8867
8868 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8869 SDValue Parts[8];
8870 for (unsigned P = 0; P < 8; ++P) {
8871 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8872 DAG.getConstant(P, SL, MVT::i32));
8873 }
8874
8875 Lo = DAG.getBitcast(LoVT,
8876 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8877 Parts[0], Parts[1], Parts[2], Parts[3]));
8878 Hi = DAG.getBitcast(HiVT,
8879 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8880 Parts[4], Parts[5], Parts[6], Parts[7]));
8881 }
8882
8883 EVT IdxVT = Idx.getValueType();
8884 unsigned NElem = VecVT.getVectorNumElements();
8885 assert(isPowerOf2_32(NElem));
8886 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8887 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8888 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8889 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8890 }
8891
8892 assert(VecSize <= 64);
8893
8894 MVT IntVT = MVT::getIntegerVT(VecSize);
8895
8896 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8897 SDValue VecBC = peekThroughBitcasts(Vec);
8898 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8899 SDValue Src = VecBC.getOperand(0);
8900 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8901 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8902 }
8903
8904 unsigned EltSize = EltVT.getSizeInBits();
8905 assert(isPowerOf2_32(EltSize));
8906
8907 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8908
8909 // Convert vector index to bit-index (* EltSize)
8910 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8911
8912 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8913 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8914
8915 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8916 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8917 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8918 }
8919
8920 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8921}
8922
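// Shuffle-mask helpers for lowerVECTOR_SHUFFLE below: elementPairIsContiguous
// checks whether the mask pair starting at Elt reads an even-aligned,
// consecutive pair of source elements; elementPairIsOddToEven checks whether
// it reads an odd element followed by an even one.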
8923static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8924 assert(Elt % 2 == 0);
8925 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8926}
8927
8928static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8929 assert(Elt % 2 == 0);
8930 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8931 !(Mask[Elt + 1] & 1);
8932}
8933
8934SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8935 SelectionDAG &DAG) const {
8936 SDLoc SL(Op);
8937 EVT ResultVT = Op.getValueType();
8938 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8939 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8940 const int NewSrcNumElts = 2;
8941 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8942 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8943
8944 // Break up the shuffle into registers sized pieces.
8945 //
8946 // We're trying to form sub-shuffles that the register allocation pipeline
8947 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8948 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8949 // pair of copies into a consecutive register copy, so use the ordinary
8950 // extract_vector_elt lowering unless we can use the shuffle.
8951 //
8952 // TODO: This is a bit of hack, and we should probably always use
8953 // extract_subvector for the largest possible subvector we can (or at least
8954 // use it for PackVT aligned pieces). However, we have worse support for
8955 // combines on them and don't directly treat extract_subvector / insert_subvector
8956 // as legal. The DAG scheduler also ends up doing a worse job with the
8957 // extract_subvectors.
8958 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8959
8960 // vector_shuffle <0,1,6,7> lhs, rhs
8961 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8962 //
8963 // vector_shuffle <6,7,2,3> lhs, rhs
8964 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8965 //
8966 // vector_shuffle <6,7,0,1> lhs, rhs
8967 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8968
8969 // Avoid scalarizing when both halves are reading from consecutive elements.
8970
8971 // If we're treating 2 element shuffles as legal, also create odd-to-even
8972 // shuffles of neighboring pairs.
8973 //
8974 // vector_shuffle <3,2,7,6> lhs, rhs
8975 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8976 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8977
8978 SmallVector<SDValue, 16> Pieces;
8979 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8980 if (ShouldUseConsecutiveExtract &&
8981 elementPairIsContiguous(SVN->getMask(), I)) {
8982 const int Idx = SVN->getMaskElt(I);
8983 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8984 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8985 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8986 SVN->getOperand(VecIdx),
8987 DAG.getConstant(EltIdx, SL, MVT::i32));
8988 Pieces.push_back(SubVec);
8989 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8990 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8991 int Idx0 = SVN->getMaskElt(I);
8992 int Idx1 = SVN->getMaskElt(I + 1);
8993
8994 SDValue SrcOp0 = SVN->getOperand(0);
8995 SDValue SrcOp1 = SrcOp0;
8996 if (Idx0 >= SrcNumElts) {
8997 SrcOp0 = SVN->getOperand(1);
8998 Idx0 -= SrcNumElts;
8999 }
9000
9001 if (Idx1 >= SrcNumElts) {
9002 SrcOp1 = SVN->getOperand(1);
9003 Idx1 -= SrcNumElts;
9004 }
9005
9006 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9007 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9008
9009 // Extract nearest even aligned piece.
9010 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
9011 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
9012 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
9013 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
9014
9015 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9016 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9017
9018 SDValue Result0 = SubVec0;
9019 SDValue Result1 = SubVec0;
9020
9021 if (SubVec0 != SubVec1) {
9022 NewMaskIdx1 += NewSrcNumElts;
9023 Result1 = SubVec1;
9024 } else {
9025 Result1 = DAG.getPOISON(PackVT);
9026 }
9027
9028 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
9029 {NewMaskIdx0, NewMaskIdx1});
9030 Pieces.push_back(Shuf);
9031 } else {
9032 const int Idx0 = SVN->getMaskElt(I);
9033 const int Idx1 = SVN->getMaskElt(I + 1);
9034 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9035 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9036 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9037 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9038
9039 SDValue Vec0 = SVN->getOperand(VecIdx0);
9040 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
9041 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
9042
9043 SDValue Vec1 = SVN->getOperand(VecIdx1);
9044 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
9045 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
9046 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
9047 }
9048 }
9049
9050 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
9051}
9052
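// Lower SCALAR_TO_VECTOR as a BUILD_VECTOR whose first element is the scalar
// value and whose remaining elements are poison.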
9053SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9054 SelectionDAG &DAG) const {
9055 SDValue SVal = Op.getOperand(0);
9056 EVT ResultVT = Op.getValueType();
9057 EVT SValVT = SVal.getValueType();
9058 SDValue UndefVal = DAG.getPOISON(SValVT);
9059 SDLoc SL(Op);
9060
9061 SmallVector<SDValue, 16> VElts;
9062 VElts.push_back(SVal);
9063 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9064 VElts.push_back(UndefVal);
9065
9066 return DAG.getBuildVector(ResultVT, SL, VElts);
9067}
9068
9069SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9070 SelectionDAG &DAG) const {
9071 SDLoc SL(Op);
9072 EVT VT = Op.getValueType();
9073
9074 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9075 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9076
9077 SDValue Lo = Op.getOperand(0);
9078 SDValue Hi = Op.getOperand(1);
9079
9080 // Avoid adding defined bits with the zero_extend.
9081 if (Hi.isUndef()) {
9082 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9083 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
9084 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
9085 }
9086
9087 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
9088 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
9089
9090 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
9091 DAG.getConstant(16, SL, MVT::i32));
9092 if (Lo.isUndef())
9093 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
9094
9095 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9096 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
9097
9098 SDValue Or =
9099 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
9100 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
9101 }
9102
9103 // Split into 2-element chunks.
9104 const unsigned NumParts = VT.getVectorNumElements() / 2;
9105 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
9106 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
9107
9108 SmallVector<SDValue, 4> Casts;
9109 for (unsigned P = 0; P < NumParts; ++P) {
9110 SDValue Vec = DAG.getBuildVector(
9111 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
9112 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
9113 }
9114
9115 SDValue Blend =
9116 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
9117 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
9118}
9119
9120 bool SITargetLowering::isOffsetFoldingLegal(
9121 const GlobalAddressSDNode *GA) const {
9122 // OSes that use ELF REL relocations (instead of RELA) can only store a
9123 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9124 // which can create arbitrary 64-bit addends. (This is only a problem for
9125 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9126 // the high 32 bits of the addend.)
9127 //
9128 // This should be kept in sync with how HasRelocationAddend is initialized in
9129 // the constructor of ELFAMDGPUAsmBackend.
9130 if (!Subtarget->isAmdHsaOS())
9131 return false;
9132
9133 // We can fold offsets for anything that doesn't require a GOT relocation.
9134 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9135 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
9136 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
9137 !shouldEmitGOTReloc(GA->getGlobal());
9138}
9139
9140 static SDValue
9141 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
9142 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9143 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9144 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9145 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9146 // lowered to the following code sequence:
9147 //
9148 // For constant address space:
9149 // s_getpc_b64 s[0:1]
9150 // s_add_u32 s0, s0, $symbol
9151 // s_addc_u32 s1, s1, 0
9152 //
9153 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9154 // a fixup or relocation is emitted to replace $symbol with a literal
9155 // constant, which is a pc-relative offset from the encoding of the $symbol
9156 // operand to the global variable.
9157 //
9158 // For global address space:
9159 // s_getpc_b64 s[0:1]
9160 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9161 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9162 //
9163 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9164 // fixups or relocations are emitted to replace $symbol@*@lo and
9165 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9166 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9167 // operand to the global variable.
9168 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9169 assert(GAFlags != SIInstrInfo::MO_NONE);
9170
9171 SDValue Ptr =
9172 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9173 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9174 }
9175
9176 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9177 SDValue PtrHi;
9178 if (GAFlags == SIInstrInfo::MO_NONE)
9179 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9180 else
9181 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9182 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9183}
9184
9185SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
9186 SDValue Op,
9187 SelectionDAG &DAG) const {
9188 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9189 SDLoc DL(GSD);
9190 EVT PtrVT = Op.getValueType();
9191
9192 const GlobalValue *GV = GSD->getGlobal();
9193 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9194 shouldUseLDSConstAddress(GV)) ||
9195 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
9196 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
9197 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9198 GV->hasExternalLinkage()) {
9199 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
9200 // HIP uses an unsized array `extern __shared__ T s[]` or similar
9201 // zero-sized type in other languages to declare the dynamic shared
9202 // memory whose size is not known at compile time. They will be
9203 // allocated by the runtime and placed directly after the statically
9204 // allocated ones. They all share the same offset.
9205 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
9206 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9207 // Adjust alignment for that dynamic shared memory array.
9208 const Function &F = DAG.getMachineFunction().getFunction();
9209 MFI->setDynLDSAlign(F, GVar);
9210 MFI->setUsesDynamicLDS(true);
9211 return SDValue(
9212 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9213 }
9214 }
9215 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
9216 }
9217
9218 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
9219 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9220 SIInstrInfo::MO_ABS32_LO);
9221 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9222 }
9223
9224 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9225 if (Subtarget->has64BitLiterals()) {
9226 SDValue Addr = DAG.getTargetGlobalAddress(
9227 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9228 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9229 0);
9230 }
9231
9232 SDValue AddrLo = DAG.getTargetGlobalAddress(
9233 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9234 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9235
9236 SDValue AddrHi = DAG.getTargetGlobalAddress(
9237 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9238 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9239
9240 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9241 }
9242
9243 if (shouldEmitFixup(GV))
9244 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9245
9246 if (shouldEmitPCReloc(GV))
9247 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9248 SIInstrInfo::MO_REL32);
9249
9250 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9251 SIInstrInfo::MO_GOTPCREL32);
9252 PointerType *PtrTy =
9253 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
9254 const DataLayout &DataLayout = DAG.getDataLayout();
9255 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9256 MachinePointerInfo PtrInfo =
9257 MachinePointerInfo::getGOT(DAG.getMachineFunction());
9258
9259 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9260 MachineMemOperand::MODereferenceable |
9261 MachineMemOperand::MOInvariant);
9262}
9263
9264SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9265 SelectionDAG &DAG) const {
9266 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9267 const Function &Fn = DAG.getMachineFunction().getFunction();
9268 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9269 Fn, "unsupported external symbol", Op.getDebugLoc()));
9270 return DAG.getPOISON(Op.getValueType());
9271}
9272
9273 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9274 const SDLoc &DL, SDValue V) const {
9275 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9276 // the destination register.
9277 //
9278 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9279 // so we will end up with redundant moves to m0.
9280 //
9281 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9282
9283 // A Null SDValue creates a glue result.
9284 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9285 V, Chain);
9286 return SDValue(M0, 0);
9287}
9288
9289SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9290 MVT VT,
9291 unsigned Offset) const {
9292 SDLoc SL(Op);
9293 SDValue Param = lowerKernargMemParameter(
9294 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9295 // The local size values will have the hi 16-bits as zero.
9296 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9297 DAG.getValueType(VT));
9298}
9299
9300 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9301 EVT VT) {
9302 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9303 DAG.getMachineFunction().getFunction(),
9304 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9305 return DAG.getPOISON(VT);
9306}
9307
9308 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9309 EVT VT) {
9310 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9311 DAG.getMachineFunction().getFunction(),
9312 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9313 return DAG.getPOISON(VT);
9314}
9315
9316 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9317 ArrayRef<SDValue> Elts) {
9318 assert(!Elts.empty());
9319 MVT Type;
9320 unsigned NumElts = Elts.size();
9321
9322 if (NumElts <= 12) {
9323 Type = MVT::getVectorVT(MVT::f32, NumElts);
9324 } else {
9325 assert(Elts.size() <= 16);
9326 Type = MVT::v16f32;
9327 NumElts = 16;
9328 }
9329
9330 SmallVector<SDValue, 16> VecElts(NumElts);
9331 for (unsigned i = 0; i < Elts.size(); ++i) {
9332 SDValue Elt = Elts[i];
9333 if (Elt.getValueType() != MVT::f32)
9334 Elt = DAG.getBitcast(MVT::f32, Elt);
9335 VecElts[i] = Elt;
9336 }
9337 for (unsigned i = Elts.size(); i < NumElts; ++i)
9338 VecElts[i] = DAG.getPOISON(MVT::f32);
9339
9340 if (NumElts == 1)
9341 return VecElts[0];
9342 return DAG.getBuildVector(Type, DL, VecElts);
9343}
9344
9345static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9346 SDValue Src, int ExtraElts) {
9347 EVT SrcVT = Src.getValueType();
9348
9349 SmallVector<SDValue, 8> Elts;
9350
9351 if (SrcVT.isVector())
9352 DAG.ExtractVectorElements(Src, Elts);
9353 else
9354 Elts.push_back(Src);
9355
9356 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9357 while (ExtraElts--)
9358 Elts.push_back(Undef);
9359
9360 return DAG.getBuildVector(CastVT, DL, Elts);
9361}
9362
9363 // Reconstruct the required return value for an image load intrinsic.
9364 // This is more complicated due to the optional use of TexFailCtrl, which
9365 // means the required return type is an aggregate.
9366 static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9367 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9368 bool Unpacked, bool IsD16, int DMaskPop,
9369 int NumVDataDwords, bool IsAtomicPacked16Bit,
9370 const SDLoc &DL) {
9371 // Determine the required return type. This is the same regardless of
9372 // IsTexFail flag
9373 EVT ReqRetVT = ResultTypes[0];
9374 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9375 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9376 ? (ReqRetNumElts + 1) / 2
9377 : ReqRetNumElts;
9378
9379 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9380
9381 MVT DataDwordVT =
9382 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9383
9384 MVT MaskPopVT =
9385 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9386
9387 SDValue Data(Result, 0);
9388 SDValue TexFail;
9389
9390 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9391 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9392 if (MaskPopVT.isVector()) {
9393 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9394 SDValue(Result, 0), ZeroIdx);
9395 } else {
9396 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9397 SDValue(Result, 0), ZeroIdx);
9398 }
9399 }
9400
9401 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9402 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9403 NumDataDwords - MaskPopDwords);
9404
9405 if (IsD16)
9406 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9407
9408 EVT LegalReqRetVT = ReqRetVT;
9409 if (!ReqRetVT.isVector()) {
9410 if (!Data.getValueType().isInteger())
9411 Data = DAG.getNode(ISD::BITCAST, DL,
9412 Data.getValueType().changeTypeToInteger(), Data);
9413 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9414 } else {
9415 // We need to widen the return vector to a legal type
9416 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9417 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9418 LegalReqRetVT =
9419 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9420 ReqRetVT.getVectorNumElements() + 1);
9421 }
9422 }
9423 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9424
9425 if (IsTexFail) {
9426 TexFail =
9427 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9428 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9429
9430 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9431 }
9432
9433 if (Result->getNumValues() == 1)
9434 return Data;
9435
9436 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9437}
9438
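// Decode the texfailctrl immediate operand: bit 0 requests TFE and bit 1
// requests LWE. Returns false if any other bits are set.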
9439static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9440 SDValue *LWE, bool &IsTexFail) {
9441 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9442
9443 uint64_t Value = TexFailCtrlConst->getZExtValue();
9444 if (Value) {
9445 IsTexFail = true;
9446 }
9447
9448 SDLoc DL(TexFailCtrlConst);
9449 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9450 Value &= ~(uint64_t)0x1;
9451 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9452 Value &= ~(uint64_t)0x2;
9453
9454 return Value == 0;
9455}
9456
9457 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9458 MVT PackVectorVT,
9459 SmallVectorImpl<SDValue> &PackedAddrs,
9460 unsigned DimIdx, unsigned EndIdx,
9461 unsigned NumGradients) {
9462 SDLoc DL(Op);
9463 for (unsigned I = DimIdx; I < EndIdx; I++) {
9464 SDValue Addr = Op.getOperand(I);
9465
9466 // Gradients are packed with undef for each coordinate.
9467 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9468 // 1D: undef,dx/dh; undef,dx/dv
9469 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9470 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9471 if (((I + 1) >= EndIdx) ||
9472 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9473 I == DimIdx + NumGradients - 1))) {
9474 if (Addr.getValueType() != MVT::i16)
9475 Addr = DAG.getBitcast(MVT::i16, Addr);
9476 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9477 } else {
9478 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9479 I++;
9480 }
9481 Addr = DAG.getBitcast(MVT::f32, Addr);
9482 PackedAddrs.push_back(Addr);
9483 }
9484}
9485
9486 SDValue SITargetLowering::lowerImage(SDValue Op,
9487 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9488 SelectionDAG &DAG, bool WithChain) const {
9489 SDLoc DL(Op);
9490 MachineFunction &MF = DAG.getMachineFunction();
9491 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9492 unsigned IntrOpcode = Intr->BaseOpcode;
9493 // For image atomic: use no-return opcode if result is unused.
9494 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9495 !Op.getNode()->hasAnyUseOfValue(0))
9496 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9497 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9498 AMDGPU::getMIMGBaseOpcodeInfo(IntrOpcode);
9499 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9500 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9501 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9502 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9503
9504 SmallVector<EVT, 3> ResultTypes(Op->values());
9505 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9506 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9507 ResultTypes.erase(&ResultTypes[0]);
9508
9509 bool IsD16 = false;
9510 bool IsG16 = false;
9511 bool IsA16 = false;
9512 SDValue VData;
9513 int NumVDataDwords = 0;
9514 bool AdjustRetType = false;
9515 bool IsAtomicPacked16Bit = false;
9516
9517 // Offset of intrinsic arguments
9518 const unsigned ArgOffset = WithChain ? 2 : 1;
9519
9520 unsigned DMask;
9521 unsigned DMaskLanes = 0;
9522
9523 if (BaseOpcode->Atomic) {
9524 VData = Op.getOperand(2);
9525
9526 IsAtomicPacked16Bit =
9527 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9528 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9529 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9530 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9531
9532 bool Is64Bit = VData.getValueSizeInBits() == 64;
9533 if (BaseOpcode->AtomicX2) {
9534 SDValue VData2 = Op.getOperand(3);
9535 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9536 {VData, VData2});
9537 if (Is64Bit)
9538 VData = DAG.getBitcast(MVT::v4i32, VData);
9539
9540 if (!BaseOpcode->NoReturn)
9541 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9542
9543 DMask = Is64Bit ? 0xf : 0x3;
9544 NumVDataDwords = Is64Bit ? 4 : 2;
9545 } else {
9546 DMask = Is64Bit ? 0x3 : 0x1;
9547 NumVDataDwords = Is64Bit ? 2 : 1;
9548 }
9549 } else {
9550 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9551 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9552
9553 if (BaseOpcode->Store) {
9554 VData = Op.getOperand(2);
9555
9556 MVT StoreVT = VData.getSimpleValueType();
9557 if (StoreVT.getScalarType() == MVT::f16) {
9558 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9559 return Op; // D16 is unsupported for this instruction
9560
9561 IsD16 = true;
9562 VData = handleD16VData(VData, DAG, true);
9563 }
9564
9565 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9566 } else if (!BaseOpcode->NoReturn) {
9567 // Work out the num dwords based on the dmask popcount and underlying type
9568 // and whether packing is supported.
9569 MVT LoadVT = ResultTypes[0].getSimpleVT();
9570 if (LoadVT.getScalarType() == MVT::f16) {
9571 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9572 return Op; // D16 is unsupported for this instruction
9573
9574 IsD16 = true;
9575 }
9576
9577 // Confirm that the return type is large enough for the dmask specified
9578 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9579 (!LoadVT.isVector() && DMaskLanes > 1))
9580 return Op;
9581
9582 // The sq block of gfx8 and gfx9 does not estimate register use correctly
9583 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9584 // instructions.
9585 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9586 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9587 NumVDataDwords = (DMaskLanes + 1) / 2;
9588 else
9589 NumVDataDwords = DMaskLanes;
9590
9591 AdjustRetType = true;
9592 }
9593 }
9594
9595 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9596 SmallVector<SDValue, 4> VAddrs;
9597
9598 // Check for 16 bit addresses or derivatives and pack if true.
9599 MVT VAddrVT =
9600 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9601 MVT VAddrScalarVT = VAddrVT.getScalarType();
9602 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9603 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9604
9605 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9606 VAddrScalarVT = VAddrVT.getScalarType();
9607 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9608 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9609
9610 // Push back extra arguments.
9611 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9612 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9613 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9614 // Special handling of bias when A16 is on. Bias is of type half but
9615 // occupies full 32-bit.
9616 SDValue Bias = DAG.getBuildVector(
9617 MVT::v2f16, DL,
9618 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9619 VAddrs.push_back(Bias);
9620 } else {
9621 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9622 "Bias needs to be converted to 16 bit in A16 mode");
9623 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9624 }
9625 }
9626
9627 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9628 // 16 bit gradients are supported, but are tied to the A16 control
9629 // so both gradients and addresses must be 16 bit
9630 LLVM_DEBUG(
9631 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9632 "require 16 bit args for both gradients and addresses");
9633 return Op;
9634 }
9635
9636 if (IsA16) {
9637 if (!ST->hasA16()) {
9638 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9639 "support 16 bit addresses\n");
9640 return Op;
9641 }
9642 }
9643
9644 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
9645 // set then we have to compress/pack operands (either address, gradient, or
9646 // both).
9647 // In the case where A16 and gradients are tied (no G16 support), we have
9648 // already verified that both IsA16 and IsG16 are true.
9649 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9650 // Activate g16
9651 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9652 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9653 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9654 }
9655
9656 // Add gradients (packed or unpacked)
9657 if (IsG16) {
9658 // Pack the gradients
9659 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9660 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9661 ArgOffset + Intr->GradientStart,
9662 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9663 } else {
9664 for (unsigned I = ArgOffset + Intr->GradientStart;
9665 I < ArgOffset + Intr->CoordStart; I++)
9666 VAddrs.push_back(Op.getOperand(I));
9667 }
9668
9669 // Add addresses (packed or unpacked)
9670 if (IsA16) {
9671 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9672 ArgOffset + Intr->CoordStart, VAddrEnd,
9673 0 /* No gradients */);
9674 } else {
9675 // Add uncompressed address
9676 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9677 VAddrs.push_back(Op.getOperand(I));
9678 }
9679
9680 // If the register allocator cannot place the address registers contiguously
9681 // without introducing moves, then using the non-sequential address encoding
9682 // is always preferable, since it saves VALU instructions and is usually a
9683 // wash in terms of code size or even better.
9684 //
9685 // However, we currently have no way of hinting to the register allocator that
9686 // MIMG addresses should be placed contiguously when it is possible to do so,
9687 // so force non-NSA for the common 2-address case as a heuristic.
9688 //
9689 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9690 // allocation when possible.
9691 //
9692 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9693 // set of the remaining addresses.
9694 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9695 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9696 const bool UseNSA = ST->hasNSAEncoding() &&
9697 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9698 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9699 const bool UsePartialNSA =
9700 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9701
9702 SDValue VAddr;
9703 if (UsePartialNSA) {
9704 VAddr = getBuildDwordsVector(DAG, DL,
9705 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9706 } else if (!UseNSA) {
9707 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9708 }
9709
9710 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9711 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9712 SDValue Unorm;
9713 if (!BaseOpcode->Sampler) {
9714 Unorm = True;
9715 } else {
9716 uint64_t UnormConst =
9717 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9718
9719 Unorm = UnormConst ? True : False;
9720 }
9721
9722 SDValue TFE;
9723 SDValue LWE;
9724 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9725 bool IsTexFail = false;
9726 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9727 return Op;
9728
9729 if (IsTexFail) {
9730 if (!DMaskLanes) {
9731 // Expecting to get an error flag since TFC is on - and dmask is 0
9732 // Force dmask to be at least 1 otherwise the instruction will fail
9733 DMask = 0x1;
9734 DMaskLanes = 1;
9735 NumVDataDwords = 1;
9736 }
9737 NumVDataDwords += 1;
9738 AdjustRetType = true;
9739 }
9740
9741 // Has something earlier tagged that the return type needs adjusting
9742 // This happens if the instruction is a load or has set TexFailCtrl flags
9743 if (AdjustRetType) {
9744 // NumVDataDwords reflects the true number of dwords required in the return
9745 // type
9746 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9747 // This is a no-op load. This can be eliminated
9748 SDValue Undef = DAG.getPOISON(Op.getValueType());
9749 if (isa<MemSDNode>(Op))
9750 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9751 return Undef;
9752 }
9753
9754 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9755 MVT::i32, NumVDataDwords)
9756 : MVT::i32;
9757
9758 ResultTypes[0] = NewVT;
9759 if (ResultTypes.size() == 3) {
9760 // Original result was aggregate type used for TexFailCtrl results
9761 // The actual instruction returns as a vector type which has now been
9762 // created. Remove the aggregate result.
9763 ResultTypes.erase(&ResultTypes[1]);
9764 }
9765 }
9766
9767 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9768 // Keep GLC only when the atomic's result is actually used.
9769 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9770 CPol |= AMDGPU::CPol::GLC;
9771 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9772 AMDGPU::CPol::VOLATILE))
9773 return Op;
9774 SmallVector<SDValue, 26> Ops;
9775
9776 if (BaseOpcode->Store || BaseOpcode->Atomic)
9777 Ops.push_back(VData); // vdata
9778 if (UsePartialNSA) {
9779 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9780 Ops.push_back(VAddr);
9781 } else if (UseNSA)
9782 append_range(Ops, VAddrs);
9783 else
9784 Ops.push_back(VAddr);
9785 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9786 EVT RsrcVT = Rsrc.getValueType();
9787 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9788 return Op;
9789 Ops.push_back(Rsrc);
9790 if (BaseOpcode->Sampler) {
9791 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9792 if (Samp.getValueType() != MVT::v4i32)
9793 return Op;
9794 Ops.push_back(Samp);
9795 }
9796 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9797 if (IsGFX10Plus)
9798 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9799 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9800 Ops.push_back(Unorm);
9801 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9802 Ops.push_back(IsA16 && // r128, a16 for gfx9
9803 ST->hasFeature(AMDGPU::FeatureR128A16)
9804 ? True
9805 : False);
9806 if (IsGFX10Plus)
9807 Ops.push_back(IsA16 ? True : False);
9808
9809 if (!Subtarget->hasGFX90AInsts())
9810 Ops.push_back(TFE); // tfe
9811 else if (TFE->getAsZExtVal()) {
9812 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9813 DAG.getMachineFunction().getFunction(),
9814 "TFE is not supported on this GPU", DL.getDebugLoc()));
9815 }
9816
9817 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9818 Ops.push_back(LWE); // lwe
9819 if (!IsGFX10Plus)
9820 Ops.push_back(DimInfo->DA ? True : False);
9821 if (BaseOpcode->HasD16)
9822 Ops.push_back(IsD16 ? True : False);
9823 if (isa<MemSDNode>(Op))
9824 Ops.push_back(Op.getOperand(0)); // chain
9825
9826 int NumVAddrDwords =
9827 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9828 int Opcode = -1;
9829
9830 if (IsGFX12Plus) {
9831 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9832 NumVDataDwords, NumVAddrDwords);
9833 } else if (IsGFX11Plus) {
9834 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9835 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9836 : AMDGPU::MIMGEncGfx11Default,
9837 NumVDataDwords, NumVAddrDwords);
9838 } else if (IsGFX10Plus) {
9839 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9840 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9841 : AMDGPU::MIMGEncGfx10Default,
9842 NumVDataDwords, NumVAddrDwords);
9843 } else {
9844 if (Subtarget->hasGFX90AInsts()) {
9845 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9846 NumVDataDwords, NumVAddrDwords);
9847 if (Opcode == -1) {
9848 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9849 DAG.getMachineFunction().getFunction(),
9850 "requested image instruction is not supported on this GPU",
9851 DL.getDebugLoc()));
9852
9853 unsigned Idx = 0;
9854 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9855 for (EVT VT : OrigResultTypes) {
9856 if (VT == MVT::Other)
9857 RetValues[Idx++] = Op.getOperand(0); // Chain
9858 else
9859 RetValues[Idx++] = DAG.getPOISON(VT);
9860 }
9861
9862 return DAG.getMergeValues(RetValues, DL);
9863 }
9864 }
9865 if (Opcode == -1 &&
9866 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9867 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9868 NumVDataDwords, NumVAddrDwords);
9869 if (Opcode == -1)
9870 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9871 NumVDataDwords, NumVAddrDwords);
9872 }
9873 if (Opcode == -1)
9874 return Op;
9875
9876 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9877 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9878 MachineMemOperand *MemRef = MemOp->getMemOperand();
9879 DAG.setNodeMemRefs(NewNode, {MemRef});
9880 }
9881
9882 if (BaseOpcode->NoReturn) {
9883 if (BaseOpcode->Atomic)
9884 return DAG.getMergeValues(
9885 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
9886
9887 return SDValue(NewNode, 0);
9888 }
9889
9890 if (BaseOpcode->AtomicX2) {
9891 SmallVector<SDValue, 1> Elt;
9892 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9893 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9894 }
9895
9896 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9897 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9898 NumVDataDwords, IsAtomicPacked16Bit, DL);
9899}
9900
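// Lower scalar buffer load intrinsics. Uniform offsets are emitted as scalar
// SBUFFER loads (with i16 subword and vec3-to-vec4 widening special cases);
// divergent offsets fall back to unswizzled MUBUF buffer loads, split into
// 16-byte pieces for 8- and 16-element results.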
9901SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9902 SDValue Offset, SDValue CachePolicy,
9903 SelectionDAG &DAG) const {
9904 MachineFunction &MF = DAG.getMachineFunction();
9905
9906 const DataLayout &DataLayout = DAG.getDataLayout();
9907 Align Alignment =
9908 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9909
9910 MachineMemOperand *MMO = MF.getMachineMemOperand(
9911 MachinePointerInfo(),
9912 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9913 MachineMemOperand::MOInvariant,
9914 VT.getStoreSize(), Alignment);
9915
9916 if (!Offset->isDivergent()) {
9917 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9918
9919 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9920 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9921 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9922 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9923 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9924 SDValue BufferLoad =
9925 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9926 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9927 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9928 }
9929
9930 // Widen vec3 load to vec4.
9931 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9932 !Subtarget->hasScalarDwordx3Loads()) {
9933 EVT WidenedVT =
9934 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9935 auto WidenedOp = DAG.getMemIntrinsicNode(
9936 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9937 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9938 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9939 DAG.getVectorIdxConstant(0, DL));
9940 return Subvector;
9941 }
9942
9943 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9944 DAG.getVTList(VT), Ops, VT, MMO);
9945 }
9946
9947 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9948 // assume that the buffer is unswizzled.
9949 SDValue Ops[] = {
9950 DAG.getEntryNode(), // Chain
9951 Rsrc, // rsrc
9952 DAG.getConstant(0, DL, MVT::i32), // vindex
9953 {}, // voffset
9954 {}, // soffset
9955 {}, // offset
9956 CachePolicy, // cachepolicy
9957 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9958 };
9959 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9960 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9961 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9962 }
9963
9964 SmallVector<SDValue, 4> Loads;
9965 unsigned NumLoads = 1;
9966 MVT LoadVT = VT.getSimpleVT();
9967 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9968 assert((LoadVT.getScalarType() == MVT::i32 ||
9969 LoadVT.getScalarType() == MVT::f32));
9970
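// Results wider than four dwords (8 or 16 elements) are split into multiple
// dwordx4 loads and reassembled with CONCAT_VECTORS after the loop.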
9971 if (NumElts == 8 || NumElts == 16) {
9972 NumLoads = NumElts / 4;
9973 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9974 }
9975
9976 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9977
9978 // Use the alignment to ensure that the required offsets will fit into the
9979 // immediate offsets.
9980 setBufferOffsets(Offset, DAG, &Ops[3],
9981 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9982
9983 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9984 for (unsigned i = 0; i < NumLoads; ++i) {
9985 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9986 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9987 LoadVT, MMO, DAG));
9988 }
9989
9990 if (NumElts == 8 || NumElts == 16)
9991 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9992
9993 return Loads[0];
9994}
9995
9996SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9997 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9998 if (!Subtarget->hasArchitectedSGPRs())
9999 return {};
10000 SDLoc SL(Op);
10001 MVT VT = MVT::i32;
10002 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10003 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10004 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10005}
10006
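// Read a field of a constant hardware register via S_GETREG_B32_const; the
// register id, bit offset and field width are packed into the immediate
// operand using the standard hwreg encoding.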
10007SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10008 AMDGPU::Hwreg::Id HwReg,
10009 unsigned LowBit,
10010 unsigned Width) const {
10011 SDLoc SL(Op);
10012 using namespace AMDGPU::Hwreg;
10013 return {DAG.getMachineNode(
10014 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10015 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10016 SL, MVT::i32)),
10017 0};
10018}
10019
10020SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10021 unsigned Dim,
10022 const ArgDescriptor &Arg) const {
10023 SDLoc SL(Op);
10024 MachineFunction &MF = DAG.getMachineFunction();
10025 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10026 if (MaxID == 0)
10027 return DAG.getConstant(0, SL, MVT::i32);
10028
10029 // It's undefined behavior if a function marked with the amdgpu-no-*
10030 // attributes uses the corresponding intrinsic.
10031 if (!Arg)
10032 return DAG.getPOISON(Op->getValueType(0));
10033
10034 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10035 SDLoc(DAG.getEntryNode()), Arg);
10036
10037 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10038 // masking operations anyway.
10039 //
10040 // TODO: We could assert the top bit is 0 for the source copy.
10041 if (Arg.isMasked())
10042 return Val;
10043
10044 // Preserve the known bits after expansion to a copy.
10045 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10046 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10047 DAG.getValueType(SmallVT));
10048}
10049
10050SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10051 SelectionDAG &DAG) const {
10052 MachineFunction &MF = DAG.getMachineFunction();
10053 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10054
10055 EVT VT = Op.getValueType();
10056 SDLoc DL(Op);
10057 unsigned IntrinsicID = Op.getConstantOperandVal(0);
10058
10059 // TODO: Should this propagate fast-math-flags?
10060
10061 switch (IntrinsicID) {
10062 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10063 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
10064 return emitNonHSAIntrinsicError(DAG, DL, VT);
10065 return getPreloadedValue(DAG, *MFI, VT,
10066 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
10067 }
10068 case Intrinsic::amdgcn_dispatch_ptr:
10069 case Intrinsic::amdgcn_queue_ptr: {
10070 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
10071 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10072 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10073 DL.getDebugLoc()));
10074 return DAG.getPOISON(VT);
10075 }
10076
10077 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10078 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
10079 : AMDGPUFunctionArgInfo::QUEUE_PTR;
10080 return getPreloadedValue(DAG, *MFI, VT, RegID);
10081 }
10082 case Intrinsic::amdgcn_implicitarg_ptr: {
10083 if (MFI->isEntryFunction())
10084 return getImplicitArgPtr(DAG, DL);
10085 return getPreloadedValue(DAG, *MFI, VT,
10086 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
10087 }
10088 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10089 if (!AMDGPU::isKernel(MF.getFunction())) {
10090 // This only makes sense to call in a kernel, so just lower to null.
10091 return DAG.getConstant(0, DL, VT);
10092 }
10093
10094 return getPreloadedValue(DAG, *MFI, VT,
10095 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
10096 }
10097 case Intrinsic::amdgcn_dispatch_id: {
10098 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
10099 }
10100 case Intrinsic::amdgcn_rcp:
10101 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
10102 case Intrinsic::amdgcn_rsq:
10103 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10104 case Intrinsic::amdgcn_rsq_legacy:
10105 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10106 return emitRemovedIntrinsicError(DAG, DL, VT);
10107 return SDValue();
10108 case Intrinsic::amdgcn_rcp_legacy:
10109 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10110 return emitRemovedIntrinsicError(DAG, DL, VT);
10111 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
10112 case Intrinsic::amdgcn_rsq_clamp: {
10113 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10114 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10115
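// RSQ_CLAMP was removed on VI+; emulate it by clamping the plain RSQ result
// to the largest finite values of the type.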
10116 Type *Type = VT.getTypeForEVT(*DAG.getContext());
10117 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
10118 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
10119
10120 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10121 SDValue Tmp =
10122 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
10123 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
10124 DAG.getConstantFP(Min, DL, VT));
10125 }
10126 case Intrinsic::r600_read_ngroups_x:
10127 if (Subtarget->isAmdHsaOS())
10128 return emitNonHSAIntrinsicError(DAG, DL, VT);
10129
10130 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10131 SI::KernelInputOffsets::NGROUPS_X, Align(4),
10132 false);
10133 case Intrinsic::r600_read_ngroups_y:
10134 if (Subtarget->isAmdHsaOS())
10135 return emitNonHSAIntrinsicError(DAG, DL, VT);
10136
10137 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10138 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
10139 false);
10140 case Intrinsic::r600_read_ngroups_z:
10141 if (Subtarget->isAmdHsaOS())
10142 return emitNonHSAIntrinsicError(DAG, DL, VT);
10143
10144 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10145 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
10146 false);
10147 case Intrinsic::r600_read_local_size_x:
10148 if (Subtarget->isAmdHsaOS())
10149 return emitNonHSAIntrinsicError(DAG, DL, VT);
10150
10151 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10152 SI::KernelInputOffsets::LOCAL_SIZE_X);
10153 case Intrinsic::r600_read_local_size_y:
10154 if (Subtarget->isAmdHsaOS())
10155 return emitNonHSAIntrinsicError(DAG, DL, VT);
10156
10157 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10158 SI::KernelInputOffsets::LOCAL_SIZE_Y);
10159 case Intrinsic::r600_read_local_size_z:
10160 if (Subtarget->isAmdHsaOS())
10161 return emitNonHSAIntrinsicError(DAG, DL, VT);
10162
10163 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10164 SI::KernelInputOffsets::LOCAL_SIZE_Z);
10165 case Intrinsic::amdgcn_workgroup_id_x:
10166 return lowerWorkGroupId(DAG, *MFI, VT,
10170 case Intrinsic::amdgcn_workgroup_id_y:
10171 return lowerWorkGroupId(DAG, *MFI, VT,
10175 case Intrinsic::amdgcn_workgroup_id_z:
10176 return lowerWorkGroupId(DAG, *MFI, VT,
10180 case Intrinsic::amdgcn_cluster_id_x:
10181 return Subtarget->hasClusters()
10182 ? getPreloadedValue(DAG, *MFI, VT,
10183 AMDGPUFunctionArgInfo::CLUSTER_ID_X)
10184 : DAG.getPOISON(VT);
10185 case Intrinsic::amdgcn_cluster_id_y:
10186 return Subtarget->hasClusters()
10187 ? getPreloadedValue(DAG, *MFI, VT,
10188 AMDGPUFunctionArgInfo::CLUSTER_ID_Y)
10189 : DAG.getPOISON(VT);
10190 case Intrinsic::amdgcn_cluster_id_z:
10191 return Subtarget->hasClusters()
10192 ? getPreloadedValue(DAG, *MFI, VT,
10193 AMDGPUFunctionArgInfo::CLUSTER_ID_Z)
10194 : DAG.getPOISON(VT);
10195 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10196 return Subtarget->hasClusters()
10197 ? getPreloadedValue(
10198 DAG, *MFI, VT,
10199 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
10200 : DAG.getPOISON(VT);
10201 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10202 return Subtarget->hasClusters()
10203 ? getPreloadedValue(
10204 DAG, *MFI, VT,
10205 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
10206 : DAG.getPOISON(VT);
10207 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10208 return Subtarget->hasClusters()
10209 ? getPreloadedValue(
10210 DAG, *MFI, VT,
10211 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
10212 : DAG.getPOISON(VT);
10213 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10214 return Subtarget->hasClusters()
10215 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10216 : SDValue();
10217 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10218 return Subtarget->hasClusters()
10219 ? getPreloadedValue(
10220 DAG, *MFI, VT,
10221 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
10222 : DAG.getPOISON(VT);
10223 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10224 return Subtarget->hasClusters()
10225 ? getPreloadedValue(
10226 DAG, *MFI, VT,
10227 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
10228 : DAG.getPOISON(VT);
10229 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10230 return Subtarget->hasClusters()
10231 ? getPreloadedValue(
10232 DAG, *MFI, VT,
10233 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
10234 : DAG.getPOISON(VT);
10235 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10236 return Subtarget->hasClusters()
10237 ? getPreloadedValue(
10238 DAG, *MFI, VT,
10239 AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
10240 : DAG.getPOISON(VT);
10241 case Intrinsic::amdgcn_wave_id:
10242 return lowerWaveID(DAG, Op);
10243 case Intrinsic::amdgcn_lds_kernel_id: {
10244 if (MFI->isEntryFunction())
10245 return getLDSKernelId(DAG, DL);
10246 return getPreloadedValue(DAG, *MFI, VT,
10247 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
10248 }
10249 case Intrinsic::amdgcn_workitem_id_x:
10250 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10251 case Intrinsic::amdgcn_workitem_id_y:
10252 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10253 case Intrinsic::amdgcn_workitem_id_z:
10254 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10255 case Intrinsic::amdgcn_wavefrontsize:
10256 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10257 SDLoc(Op), MVT::i32);
10258 case Intrinsic::amdgcn_s_buffer_load: {
10259 unsigned CPol = Op.getConstantOperandVal(3);
10260 // s_buffer_load, because of how it's optimized, can't be volatile
10261 // so reject ones with the volatile bit set.
10262 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10263 ? AMDGPU::CPol::ALL
10264 : AMDGPU::CPol::ALL_pregfx12))
10265 return Op;
10266 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10267 Op.getOperand(3), DAG);
10268 }
10269 case Intrinsic::amdgcn_fdiv_fast:
10270 return lowerFDIV_FAST(Op, DAG);
10271 case Intrinsic::amdgcn_sin:
10272 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10273
10274 case Intrinsic::amdgcn_cos:
10275 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10276
10277 case Intrinsic::amdgcn_mul_u24:
10278 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10279 Op.getOperand(2));
10280 case Intrinsic::amdgcn_mul_i24:
10281 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10282 Op.getOperand(2));
10283
10284 case Intrinsic::amdgcn_log_clamp: {
10285 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10286 return SDValue();
10287
10288 return emitRemovedIntrinsicError(DAG, DL, VT);
10289 }
10290 case Intrinsic::amdgcn_fract:
10291 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10292
10293 case Intrinsic::amdgcn_class:
10294 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10295 Op.getOperand(2));
10296 case Intrinsic::amdgcn_div_fmas:
10297 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10298 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10299
10300 case Intrinsic::amdgcn_div_fixup:
10301 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10302 Op.getOperand(2), Op.getOperand(3));
10303
10304 case Intrinsic::amdgcn_div_scale: {
10305 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10306
10307 // Translate to the operands expected by the machine instruction. The first
10308 // source operand is the numerator or the denominator, selected by the flag.
10309 SDValue Numerator = Op.getOperand(1);
10310 SDValue Denominator = Op.getOperand(2);
10311
10312 // Note this order is opposite of the machine instruction's operations,
10313 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10314 // intrinsic has the numerator as the first operand to match a normal
10315 // division operation.
10316
10317 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10318
10319 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10320 Denominator, Numerator);
10321 }
10322 case Intrinsic::amdgcn_icmp: {
10323 // There is a Pat that handles this variant, so return it as-is.
10324 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10325 Op.getConstantOperandVal(2) == 0 &&
10326 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10327 return Op;
10328 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10329 }
10330 case Intrinsic::amdgcn_fcmp: {
10331 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10332 }
10333 case Intrinsic::amdgcn_ballot:
10334 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10335 case Intrinsic::amdgcn_fmed3:
10336 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10337 Op.getOperand(2), Op.getOperand(3));
10338 case Intrinsic::amdgcn_fdot2:
10339 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10340 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10341 case Intrinsic::amdgcn_fmul_legacy:
10342 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10343 Op.getOperand(2));
10344 case Intrinsic::amdgcn_sffbh:
10345 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
10346 case Intrinsic::amdgcn_sbfe:
10347 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10348 Op.getOperand(2), Op.getOperand(3));
10349 case Intrinsic::amdgcn_ubfe:
10350 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10351 Op.getOperand(2), Op.getOperand(3));
10352 case Intrinsic::amdgcn_cvt_pkrtz:
10353 case Intrinsic::amdgcn_cvt_pknorm_i16:
10354 case Intrinsic::amdgcn_cvt_pknorm_u16:
10355 case Intrinsic::amdgcn_cvt_pk_i16:
10356 case Intrinsic::amdgcn_cvt_pk_u16: {
10357 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10358 EVT VT = Op.getValueType();
10359 unsigned Opcode;
10360
10361 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10362 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10363 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10364 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10365 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10366 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10367 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10368 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10369 else
10370 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10371
10372 if (isTypeLegal(VT))
10373 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10374
10375 SDValue Node =
10376 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10377 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10378 }
10379 case Intrinsic::amdgcn_fmad_ftz:
10380 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10381 Op.getOperand(2), Op.getOperand(3));
10382
10383 case Intrinsic::amdgcn_if_break:
10384 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10385 Op->getOperand(1), Op->getOperand(2)),
10386 0);
10387
10388 case Intrinsic::amdgcn_groupstaticsize: {
10389 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10390 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10391 return Op;
10392
10393 const Module *M = MF.getFunction().getParent();
10394 const GlobalValue *GV =
10395 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10396 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10397 SIInstrInfo::MO_ABS32_LO);
10398 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10399 }
10400 case Intrinsic::amdgcn_is_shared:
10401 case Intrinsic::amdgcn_is_private: {
10402 SDLoc SL(Op);
10403 SDValue SrcVec =
10404 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10405 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10406 DAG.getConstant(1, SL, MVT::i32));
10407
10408 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10409 ? AMDGPUAS::LOCAL_ADDRESS
10410 : AMDGPUAS::PRIVATE_ADDRESS;
10411 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10412 Subtarget->hasGloballyAddressableScratch()) {
10413 SDValue FlatScratchBaseHi(
10414 DAG.getMachineNode(
10415 AMDGPU::S_MOV_B32, DL, MVT::i32,
10416 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10417 0);
10418 // Test bits 63..58 against the aperture address.
10419 return DAG.getSetCC(
10420 SL, MVT::i1,
10421 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10422 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10423 }
10424
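// Otherwise compare the high half of the 64-bit pointer against the aperture
// base of the target address space.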
10425 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10426 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10427 }
10428 case Intrinsic::amdgcn_perm:
10429 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10430 Op.getOperand(2), Op.getOperand(3));
10431 case Intrinsic::amdgcn_reloc_constant: {
10432 Module *M = MF.getFunction().getParent();
10433 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10434 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10435 auto *RelocSymbol = cast<GlobalVariable>(
10436 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10437 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10438 SIInstrInfo::MO_ABS32_LO);
10439 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10440 }
10441 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10442 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10443 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10444 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10445 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10446 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10447 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10448 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
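// Operand 4 is the sparsity index key. If it is already i32 there is nothing
// to do; otherwise extend or truncate it to i32 and rebuild the node so the
// selection pattern can match.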
10449 if (Op.getOperand(4).getValueType() == MVT::i32)
10450 return SDValue();
10451
10452 SDLoc SL(Op);
10453 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10454 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10455 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10456 Op.getOperand(3), IndexKeyi32);
10457 }
10458 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10459 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10460 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10461 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10462 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10463 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10464 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10465 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10466 if (Op.getOperand(4).getValueType() == MVT::i64)
10467 return SDValue();
10468
10469 SDLoc SL(Op);
10470 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10471 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10472 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10473 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10474 Op.getOperand(6)});
10475 }
10476 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10477 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10478 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10479 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10480 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10481 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10482 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10483 ? MVT::i64
10484 : MVT::i32;
10485 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10486 return SDValue();
10487
10488 SDLoc SL(Op);
10489 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10490 SmallVector<SDValue, 10> Args = {
10491 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10492 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10493 IndexKey, Op.getOperand(7), Op.getOperand(8)};
10494 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10495 Args.push_back(Op.getOperand(9));
10496 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
10497 }
10498 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10499 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10500 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10501 if (Op.getOperand(6).getValueType() == MVT::i32)
10502 return SDValue();
10503
10504 SDLoc SL(Op);
10505 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10506 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10507 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10508 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10509 IndexKeyi32, Op.getOperand(7)});
10510 }
10511 case Intrinsic::amdgcn_addrspacecast_nonnull:
10512 return lowerADDRSPACECAST(Op, DAG);
10513 case Intrinsic::amdgcn_readlane:
10514 case Intrinsic::amdgcn_readfirstlane:
10515 case Intrinsic::amdgcn_writelane:
10516 case Intrinsic::amdgcn_permlane16:
10517 case Intrinsic::amdgcn_permlanex16:
10518 case Intrinsic::amdgcn_permlane64:
10519 case Intrinsic::amdgcn_set_inactive:
10520 case Intrinsic::amdgcn_set_inactive_chain_arg:
10521 case Intrinsic::amdgcn_mov_dpp8:
10522 case Intrinsic::amdgcn_update_dpp:
10523 return lowerLaneOp(*this, Op.getNode(), DAG);
10524 case Intrinsic::amdgcn_dead: {
10525 SmallVector<SDValue, 8> Poisons;
10526 for (const EVT ValTy : Op.getNode()->values())
10527 Poisons.push_back(DAG.getPOISON(ValTy));
10528 return DAG.getMergeValues(Poisons, SDLoc(Op));
10529 }
10530 case Intrinsic::amdgcn_wave_shuffle:
10531 return lowerWaveShuffle(*this, Op.getNode(), DAG);
10532 default:
10533 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10534 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10535 return lowerImage(Op, ImageDimIntr, DAG, false);
10536
10537 return Op;
10538 }
10539}
10540
10541// On targets not supporting constant in soffset field, turn zero to
10542 // SGPR_NULL to avoid generating an extra s_mov with zero.
10543 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10544 const GCNSubtarget *Subtarget) {
10545 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10546 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10547 return SOffset;
10548}
10549
10550SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10551 SelectionDAG &DAG,
10552 unsigned NewOpcode) const {
10553 SDLoc DL(Op);
10554
10555 SDValue VData = Op.getOperand(2);
10556 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10557 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10558 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10559 SDValue Ops[] = {
10560 Op.getOperand(0), // Chain
10561 VData, // vdata
10562 Rsrc, // rsrc
10563 DAG.getConstant(0, DL, MVT::i32), // vindex
10564 VOffset, // voffset
10565 SOffset, // soffset
10566 Offset, // offset
10567 Op.getOperand(6), // cachepolicy
10568 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10569 };
10570
10571 auto *M = cast<MemSDNode>(Op);
10572
10573 EVT MemVT = VData.getValueType();
10574 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10575 M->getMemOperand());
10576}
10577
10578SDValue
10579SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10580 unsigned NewOpcode) const {
10581 SDLoc DL(Op);
10582
10583 SDValue VData = Op.getOperand(2);
10584 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10585 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10586 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10587 SDValue Ops[] = {
10588 Op.getOperand(0), // Chain
10589 VData, // vdata
10590 Rsrc, // rsrc
10591 Op.getOperand(4), // vindex
10592 VOffset, // voffset
10593 SOffset, // soffset
10594 Offset, // offset
10595 Op.getOperand(7), // cachepolicy
10596 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10597 };
10598
10599 auto *M = cast<MemSDNode>(Op);
10600
10601 EVT MemVT = VData.getValueType();
10602 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10603 M->getMemOperand());
10604}
10605
10606SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10607 SelectionDAG &DAG) const {
10608 unsigned IntrID = Op.getConstantOperandVal(1);
10609 SDLoc DL(Op);
10610
10611 switch (IntrID) {
10612 case Intrinsic::amdgcn_ds_ordered_add:
10613 case Intrinsic::amdgcn_ds_ordered_swap: {
10614 MemSDNode *M = cast<MemSDNode>(Op);
10615 SDValue Chain = M->getOperand(0);
10616 SDValue M0 = M->getOperand(2);
10617 SDValue Value = M->getOperand(3);
10618 unsigned IndexOperand = M->getConstantOperandVal(7);
10619 unsigned WaveRelease = M->getConstantOperandVal(8);
10620 unsigned WaveDone = M->getConstantOperandVal(9);
10621
10622 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10623 IndexOperand &= ~0x3f;
10624 unsigned CountDw = 0;
10625
10626 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10627 CountDw = (IndexOperand >> 24) & 0xf;
10628 IndexOperand &= ~(0xf << 24);
10629
10630 if (CountDw < 1 || CountDw > 4) {
10631 const Function &Fn = DAG.getMachineFunction().getFunction();
10632 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10633 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10634 DL.getDebugLoc()));
10635 CountDw = 1;
10636 }
10637 }
10638
10639 if (IndexOperand) {
10640 const Function &Fn = DAG.getMachineFunction().getFunction();
10641 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10642 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10643 }
10644
10645 if (WaveDone && !WaveRelease) {
10646 // TODO: Move this to IR verifier
10647 const Function &Fn = DAG.getMachineFunction().getFunction();
10648 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10649 Fn, "ds_ordered_count: wave_done requires wave_release",
10650 DL.getDebugLoc()));
10651 }
10652
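// Pack the ds_ordered_count controls into the 16-bit offset field: offset0
// holds the ordered-count index (as a dword offset), offset1 holds
// wave_release, wave_done, the instruction kind, the dword count on GFX10+,
// and the shader type before GFX11.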
10653 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10654 unsigned ShaderType =
10655 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10656 unsigned Offset0 = OrderedCountIndex << 2;
10657 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10658
10659 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10660 Offset1 |= (CountDw - 1) << 6;
10661
10662 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10663 Offset1 |= ShaderType << 2;
10664
10665 unsigned Offset = Offset0 | (Offset1 << 8);
10666
10667 SDValue Ops[] = {
10668 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10669 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10670 };
10671 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10672 M->getVTList(), Ops, M->getMemoryVT(),
10673 M->getMemOperand());
10674 }
10675 case Intrinsic::amdgcn_raw_buffer_load:
10676 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10677 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10678 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10679 case Intrinsic::amdgcn_raw_buffer_load_format:
10680 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10681 const bool IsFormat =
10682 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10683 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10684
10685 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10686 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10687 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10688 SDValue Ops[] = {
10689 Op.getOperand(0), // Chain
10690 Rsrc, // rsrc
10691 DAG.getConstant(0, DL, MVT::i32), // vindex
10692 VOffset, // voffset
10693 SOffset, // soffset
10694 Offset, // offset
10695 Op.getOperand(5), // cachepolicy, swizzled buffer
10696 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10697 };
10698
10699 auto *M = cast<MemSDNode>(Op);
10700 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10701 }
10702 case Intrinsic::amdgcn_struct_buffer_load:
10703 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10704 case Intrinsic::amdgcn_struct_buffer_load_format:
10705 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10706 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10707 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10708 const bool IsFormat =
10709 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10710 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10711
10712 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10713 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10714 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10715 SDValue Ops[] = {
10716 Op.getOperand(0), // Chain
10717 Rsrc, // rsrc
10718 Op.getOperand(3), // vindex
10719 VOffset, // voffset
10720 SOffset, // soffset
10721 Offset, // offset
10722 Op.getOperand(6), // cachepolicy, swizzled buffer
10723 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10724 };
10725
10726 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10727 }
10728 case Intrinsic::amdgcn_raw_tbuffer_load:
10729 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10730 MemSDNode *M = cast<MemSDNode>(Op);
10731 EVT LoadVT = Op.getValueType();
10732 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10733 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10734 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10735
10736 SDValue Ops[] = {
10737 Op.getOperand(0), // Chain
10738 Rsrc, // rsrc
10739 DAG.getConstant(0, DL, MVT::i32), // vindex
10740 VOffset, // voffset
10741 SOffset, // soffset
10742 Offset, // offset
10743 Op.getOperand(5), // format
10744 Op.getOperand(6), // cachepolicy, swizzled buffer
10745 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10746 };
10747
10748 if (LoadVT.getScalarType() == MVT::f16)
10749 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10750 Ops);
10751 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10752 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10753 DAG);
10754 }
10755 case Intrinsic::amdgcn_struct_tbuffer_load:
10756 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10757 MemSDNode *M = cast<MemSDNode>(Op);
10758 EVT LoadVT = Op.getValueType();
10759 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10760 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10761 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10762
10763 SDValue Ops[] = {
10764 Op.getOperand(0), // Chain
10765 Rsrc, // rsrc
10766 Op.getOperand(3), // vindex
10767 VOffset, // voffset
10768 SOffset, // soffset
10769 Offset, // offset
10770 Op.getOperand(6), // format
10771 Op.getOperand(7), // cachepolicy, swizzled buffer
10772 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10773 };
10774
10775 if (LoadVT.getScalarType() == MVT::f16)
10776 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10777 Ops);
10778 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10779 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10780 DAG);
10781 }
10782 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10783 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10784 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10785 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10786 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10787 return lowerStructBufferAtomicIntrin(Op, DAG,
10788 AMDGPUISD::BUFFER_ATOMIC_FADD);
10789 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10790 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10791 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10792 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10793 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10794 return lowerStructBufferAtomicIntrin(Op, DAG,
10795 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10796 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10797 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10798 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10799 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10800 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10801 return lowerStructBufferAtomicIntrin(Op, DAG,
10802 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10803 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10804 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10805 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10806 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10807 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10808 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10809 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10810 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10811 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10812 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10813 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10814 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10815 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10816 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10817 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10818 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10819 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10820 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10821 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10822 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10823 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10824 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10825 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10826 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10827 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10828 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10829 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10830 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10831 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10832 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10833 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10834 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10835 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10836 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10837 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10838 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10839 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10840 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10841 return lowerStructBufferAtomicIntrin(Op, DAG,
10842 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10843 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10844 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10845 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10846 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10847 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10848 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10849 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10850 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10851 return lowerStructBufferAtomicIntrin(Op, DAG,
10852 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10853 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10855 return lowerStructBufferAtomicIntrin(Op, DAG,
10856 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10857 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10858 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10859 return lowerStructBufferAtomicIntrin(Op, DAG,
10860 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10861 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10862 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10863 return lowerStructBufferAtomicIntrin(Op, DAG,
10864 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10865 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10866 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10867 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10868 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10870 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10871 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10872 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10873 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10874 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10875 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10876 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10877 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10878 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10879 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10880 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10881 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10882 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10883 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10885 return lowerStructBufferAtomicIntrin(Op, DAG,
10886 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10887 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10889 return lowerRawBufferAtomicIntrin(Op, DAG,
10890 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10891 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10892 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10893 return lowerStructBufferAtomicIntrin(Op, DAG,
10894 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10895 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10896 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10897 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10898 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10899 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10900 SDValue Ops[] = {
10901 Op.getOperand(0), // Chain
10902 Op.getOperand(2), // src
10903 Op.getOperand(3), // cmp
10904 Rsrc, // rsrc
10905 DAG.getConstant(0, DL, MVT::i32), // vindex
10906 VOffset, // voffset
10907 SOffset, // soffset
10908 Offset, // offset
10909 Op.getOperand(7), // cachepolicy
10910 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10911 };
10912 EVT VT = Op.getValueType();
10913 auto *M = cast<MemSDNode>(Op);
10914
10915 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10916 Op->getVTList(), Ops, VT,
10917 M->getMemOperand());
10918 }
10919 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10920 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10921 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10922 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10923 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10924 SDValue Ops[] = {
10925 Op.getOperand(0), // Chain
10926 Op.getOperand(2), // src
10927 Op.getOperand(3), // cmp
10928 Rsrc, // rsrc
10929 Op.getOperand(5), // vindex
10930 VOffset, // voffset
10931 SOffset, // soffset
10932 Offset, // offset
10933 Op.getOperand(8), // cachepolicy
10934 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10935 };
10936 EVT VT = Op.getValueType();
10937 auto *M = cast<MemSDNode>(Op);
10938
10939 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10940 Op->getVTList(), Ops, VT,
10941 M->getMemOperand());
10942 }
10943 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10944 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10945 MemSDNode *M = cast<MemSDNode>(Op);
10946 SDValue NodePtr = M->getOperand(2);
10947 SDValue RayExtent = M->getOperand(3);
10948 SDValue InstanceMask = M->getOperand(4);
10949 SDValue RayOrigin = M->getOperand(5);
10950 SDValue RayDir = M->getOperand(6);
10951 SDValue Offsets = M->getOperand(7);
10952 SDValue TDescr = M->getOperand(8);
10953
10954 assert(NodePtr.getValueType() == MVT::i64);
10955 assert(RayDir.getValueType() == MVT::v3f32);
10956
10957 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10958 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10959 return SDValue();
10960 }
10961
10962 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10963 const unsigned NumVDataDwords = 10;
10964 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10965 int Opcode = AMDGPU::getMIMGOpcode(
10966 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10967 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10968 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10969 assert(Opcode != -1);
10970
10972 Ops.push_back(NodePtr);
10973 Ops.push_back(DAG.getBuildVector(
10974 MVT::v2i32, DL,
10975 {DAG.getBitcast(MVT::i32, RayExtent),
10976 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10977 Ops.push_back(RayOrigin);
10978 Ops.push_back(RayDir);
10979 Ops.push_back(Offsets);
10980 Ops.push_back(TDescr);
10981 Ops.push_back(M->getChain());
10982
10983 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10984 MachineMemOperand *MemRef = M->getMemOperand();
10985 DAG.setNodeMemRefs(NewNode, {MemRef});
10986 return SDValue(NewNode, 0);
10987 }
10988 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10989 MemSDNode *M = cast<MemSDNode>(Op);
10990 SDValue NodePtr = M->getOperand(2);
10991 SDValue RayExtent = M->getOperand(3);
10992 SDValue RayOrigin = M->getOperand(4);
10993 SDValue RayDir = M->getOperand(5);
10994 SDValue RayInvDir = M->getOperand(6);
10995 SDValue TDescr = M->getOperand(7);
10996
10997 assert(NodePtr.getValueType() == MVT::i32 ||
10998 NodePtr.getValueType() == MVT::i64);
10999 assert(RayDir.getValueType() == MVT::v3f16 ||
11000 RayDir.getValueType() == MVT::v3f32);
11001
11002 if (!Subtarget->hasGFX10_AEncoding()) {
11003 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11004 return SDValue();
11005 }
11006
11007 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11008 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11009 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11010 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11011 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11012 const unsigned NumVDataDwords = 4;
11013 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11014 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
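// NSA (non-sequential address) encodings accept the address components as
// separate operands; otherwise they must be packed into a single contiguous
// register tuple further down. GFX12+ only has the NSA form.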
11015 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11016 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11017 IsGFX12Plus;
11018 const unsigned BaseOpcodes[2][2] = {
11019 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11020 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11021 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11022 int Opcode;
11023 if (UseNSA) {
11024 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11025 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11026 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11027 : AMDGPU::MIMGEncGfx10NSA,
11028 NumVDataDwords, NumVAddrDwords);
11029 } else {
11030 assert(!IsGFX12Plus);
11031 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11032 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11033 : AMDGPU::MIMGEncGfx10Default,
11034 NumVDataDwords, NumVAddrDwords);
11035 }
11036 assert(Opcode != -1);
11037
11038 SmallVector<SDValue, 16> Ops;
11039
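// Helper that appends a 3-component vector operand as packed dwords. f32
// lanes are passed through; f16 lanes are paired into v2f16, and IsAligned
// says whether the packing starts a new dword or completes the previous
// operand's upper half.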
11040 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11041 SmallVector<SDValue, 3> Lanes;
11042 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11043 if (Lanes[0].getValueSizeInBits() == 32) {
11044 for (unsigned I = 0; I < 3; ++I)
11045 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11046 } else {
11047 if (IsAligned) {
11048 Ops.push_back(DAG.getBitcast(
11049 MVT::i32,
11050 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11051 Ops.push_back(Lanes[2]);
11052 } else {
11053 SDValue Elt0 = Ops.pop_back_val();
11054 Ops.push_back(DAG.getBitcast(
11055 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11056 Ops.push_back(DAG.getBitcast(
11057 MVT::i32,
11058 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11059 }
11060 }
11061 };
11062
11063 if (UseNSA && IsGFX11Plus) {
11064 Ops.push_back(NodePtr);
11065 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11066 Ops.push_back(RayOrigin);
11067 if (IsA16) {
11068 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11069 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11070 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11071 for (unsigned I = 0; I < 3; ++I) {
11072 MergedLanes.push_back(DAG.getBitcast(
11073 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11074 {DirLanes[I], InvDirLanes[I]})));
11075 }
11076 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11077 } else {
11078 Ops.push_back(RayDir);
11079 Ops.push_back(RayInvDir);
11080 }
11081 } else {
11082 if (Is64)
11083 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11084 2);
11085 else
11086 Ops.push_back(NodePtr);
11087
11088 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11089 packLanes(RayOrigin, true);
11090 packLanes(RayDir, true);
11091 packLanes(RayInvDir, false);
11092 }
11093
11094 if (!UseNSA) {
11095 // Build a single vector containing all the operands so far prepared.
11096 if (NumVAddrDwords > 12) {
11097 SDValue Undef = DAG.getPOISON(MVT::i32);
11098 Ops.append(16 - Ops.size(), Undef);
11099 }
11100 assert(Ops.size() >= 8 && Ops.size() <= 12);
11101 SDValue MergedOps =
11102 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11103 Ops.clear();
11104 Ops.push_back(MergedOps);
11105 }
11106
11107 Ops.push_back(TDescr);
11108 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11109 Ops.push_back(M->getChain());
11110
11111 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11112 MachineMemOperand *MemRef = M->getMemOperand();
11113 DAG.setNodeMemRefs(NewNode, {MemRef});
11114 return SDValue(NewNode, 0);
11115 }
11116 case Intrinsic::amdgcn_global_atomic_fmin_num:
11117 case Intrinsic::amdgcn_global_atomic_fmax_num:
11118 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11119 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11120 MemSDNode *M = cast<MemSDNode>(Op);
11121 SDValue Ops[] = {
11122 M->getOperand(0), // Chain
11123 M->getOperand(2), // Ptr
11124 M->getOperand(3) // Value
11125 };
11126 unsigned Opcode = 0;
11127 switch (IntrID) {
11128 case Intrinsic::amdgcn_global_atomic_fmin_num:
11129 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11130 Opcode = ISD::ATOMIC_LOAD_FMIN;
11131 break;
11132 }
11133 case Intrinsic::amdgcn_global_atomic_fmax_num:
11134 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11135 Opcode = ISD::ATOMIC_LOAD_FMAX;
11136 break;
11137 }
11138 default:
11139 llvm_unreachable("unhandled atomic opcode");
11140 }
11141 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11142 Ops, M->getMemOperand());
11143 }
11144 case Intrinsic::amdgcn_s_get_barrier_state:
11145 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11146 SDValue Chain = Op->getOperand(0);
11147 SmallVector<SDValue, 2> Ops;
11148 unsigned Opc;
11149
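// Named barriers carry the barrier id in bits [9:4] of the operand; extract
// it and pick the immediate or M0 form of S_GET_BARRIER_STATE.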
11150 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11151 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11152 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11153 BarID = (BarID >> 4) & 0x3F;
11154 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11155 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11156 Ops.push_back(K);
11157 Ops.push_back(Chain);
11158 } else {
11159 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11160 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11161 SDValue M0Val;
11162 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11163 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11164 M0Val = SDValue(
11165 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11166 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11167 0);
11168 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11169 } else
11170 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11171 }
11172
11173 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11174 return SDValue(NewMI, 0);
11175 }
11176 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11177 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11178 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11179 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11180 SDValue Chain = Op->getOperand(0);
11181 SDValue Ptr = Op->getOperand(2);
11182 EVT VT = Op->getValueType(0);
11183 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11184 Chain, Ptr, MII->getMemOperand());
11185 }
11186 default:
11187
11188 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11189 AMDGPU::getImageDimIntrinsicInfo(IntrID))
11190 return lowerImage(Op, ImageDimIntr, DAG, true);
11191
11192 return SDValue();
11193 }
11194}
11195
11196// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11197// dwordx4 if on SI and handle TFE loads.
11198SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11199 SDVTList VTList,
11200 ArrayRef<SDValue> Ops, EVT MemVT,
11201 MachineMemOperand *MMO,
11202 SelectionDAG &DAG) const {
11203 LLVMContext &C = *DAG.getContext();
11204 MachineFunction &MF = DAG.getMachineFunction();
11205 EVT VT = VTList.VTs[0];
11206
11207 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11208 bool IsTFE = VTList.NumVTs == 3;
11209 if (IsTFE) {
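// TFE loads return an extra status dword, so load NumValueDWords + 1 dwords
// and then split the value and status components back apart.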
11210 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11211 unsigned NumOpDWords = NumValueDWords + 1;
11212 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11213 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11214 MachineMemOperand *OpDWordsMMO =
11215 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11216 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11217 OpDWordsVT, OpDWordsMMO, DAG);
11218 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11219 DAG.getVectorIdxConstant(NumValueDWords, DL));
11220 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11221 SDValue ValueDWords =
11222 NumValueDWords == 1
11223 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11224 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
11225 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11226 ZeroIdx);
11227 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11228 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11229 }
11230
11231 if (!Subtarget->hasDwordx3LoadStores() &&
11232 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11233 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11234 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11235 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11236 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11237 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11238 WidenedMemVT, WidenedMMO);
11239 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
11240 DAG.getVectorIdxConstant(0, DL));
11241 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11242 }
11243
11244 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11245}
11246
11247SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11248 bool ImageStore) const {
11249 EVT StoreVT = VData.getValueType();
11250
11251 // No change for f16 and legal vector D16 types.
11252 if (!StoreVT.isVector())
11253 return VData;
11254
11255 SDLoc DL(VData);
11256 unsigned NumElements = StoreVT.getVectorNumElements();
11257
11258 if (Subtarget->hasUnpackedD16VMem()) {
11259 // We need to unpack the packed data to store.
11260 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11261 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11262
11263 EVT EquivStoreVT =
11264 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11265 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11266 return DAG.UnrollVectorOp(ZExt.getNode());
11267 }
11268
11269 // The sq block of gfx8.1 does not estimate register use correctly for d16
11270 // image store instructions. The data operand is computed as if it were not a
11271 // d16 image instruction.
11272 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11273 // Bitcast to i16
11274 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11275 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11276
11277 // Decompose into scalars
11278 SmallVector<SDValue, 4> Elts;
11279 DAG.ExtractVectorElements(IntVData, Elts);
11280
11281 // Group pairs of i16 into v2i16 and bitcast to i32
11282 SmallVector<SDValue, 4> PackedElts;
11283 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11284 SDValue Pair =
11285 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11286 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11287 PackedElts.push_back(IntPair);
11288 }
11289 if ((NumElements % 2) == 1) {
11290 // Handle v3i16
11291 unsigned I = Elts.size() / 2;
11292 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11293 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11294 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11295 PackedElts.push_back(IntPair);
11296 }
11297
11298 // Pad with poison out to the unpacked (non-d16) element count
11299 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11300
11301 // Build final vector
11302 EVT VecVT =
11303 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11304 return DAG.getBuildVector(VecVT, DL, PackedElts);
11305 }
11306
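// Widen a 3-element store to 4 elements by zero-extending the bitcast
// integer value.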
11307 if (NumElements == 3) {
11308 EVT IntStoreVT =
11309 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11310 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11311
11312 EVT WidenedStoreVT = EVT::getVectorVT(
11313 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11314 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11315 WidenedStoreVT.getStoreSizeInBits());
11316 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11317 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11318 }
11319
11320 assert(isTypeLegal(StoreVT));
11321 return VData;
11322}
11323
11324SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11325 SelectionDAG &DAG) const {
11326 SDLoc DL(Op);
11327 SDValue Chain = Op.getOperand(0);
11328 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11329 MachineFunction &MF = DAG.getMachineFunction();
11330
11331 switch (IntrinsicID) {
11332 case Intrinsic::amdgcn_exp_compr: {
11333 if (!Subtarget->hasCompressedExport()) {
11334 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11335 DAG.getMachineFunction().getFunction(),
11336 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11337 }
11338 SDValue Src0 = Op.getOperand(4);
11339 SDValue Src1 = Op.getOperand(5);
11340 // Hack around illegal type on SI by directly selecting it.
11341 if (isTypeLegal(Src0.getValueType()))
11342 return SDValue();
11343
11344 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
11345 SDValue Undef = DAG.getPOISON(MVT::f32);
11346 const SDValue Ops[] = {
11347 Op.getOperand(2), // tgt
11348 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
11349 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
11350 Undef, // src2
11351 Undef, // src3
11352 Op.getOperand(7), // vm
11353 DAG.getTargetConstant(1, DL, MVT::i1), // compr
11354 Op.getOperand(3), // en
11355 Op.getOperand(0) // Chain
11356 };
11357
11358 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11359 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
11360 }
11361
11362 case Intrinsic::amdgcn_struct_tbuffer_store:
11363 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11364 SDValue VData = Op.getOperand(2);
11365 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11366 if (IsD16)
11367 VData = handleD16VData(VData, DAG);
11368 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11369 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11370 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11371 SDValue Ops[] = {
11372 Chain,
11373 VData, // vdata
11374 Rsrc, // rsrc
11375 Op.getOperand(4), // vindex
11376 VOffset, // voffset
11377 SOffset, // soffset
11378 Offset, // offset
11379 Op.getOperand(7), // format
11380 Op.getOperand(8), // cachepolicy, swizzled buffer
11381 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11382 };
11383 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11384 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11385 MemSDNode *M = cast<MemSDNode>(Op);
11386 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11387 M->getMemoryVT(), M->getMemOperand());
11388 }
11389
11390 case Intrinsic::amdgcn_raw_tbuffer_store:
11391 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11392 SDValue VData = Op.getOperand(2);
11393 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11394 if (IsD16)
11395 VData = handleD16VData(VData, DAG);
11396 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11397 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11398 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11399 SDValue Ops[] = {
11400 Chain,
11401 VData, // vdata
11402 Rsrc, // rsrc
11403 DAG.getConstant(0, DL, MVT::i32), // vindex
11404 VOffset, // voffset
11405 SOffset, // soffset
11406 Offset, // offset
11407 Op.getOperand(6), // format
11408 Op.getOperand(7), // cachepolicy, swizzled buffer
11409 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11410 };
11411 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11412 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11413 MemSDNode *M = cast<MemSDNode>(Op);
11414 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11415 M->getMemoryVT(), M->getMemOperand());
11416 }
11417
11418 case Intrinsic::amdgcn_raw_buffer_store:
11419 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11420 case Intrinsic::amdgcn_raw_buffer_store_format:
11421 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11422 const bool IsFormat =
11423 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11424 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11425
11426 SDValue VData = Op.getOperand(2);
11427 EVT VDataVT = VData.getValueType();
11428 EVT EltType = VDataVT.getScalarType();
11429 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11430 if (IsD16) {
11431 VData = handleD16VData(VData, DAG);
11432 VDataVT = VData.getValueType();
11433 }
11434
11435 if (!isTypeLegal(VDataVT)) {
11436 VData =
11437 DAG.getNode(ISD::BITCAST, DL,
11438 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11439 }
11440
11441 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11442 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11443 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11444 SDValue Ops[] = {
11445 Chain,
11446 VData,
11447 Rsrc,
11448 DAG.getConstant(0, DL, MVT::i32), // vindex
11449 VOffset, // voffset
11450 SOffset, // soffset
11451 Offset, // offset
11452 Op.getOperand(6), // cachepolicy, swizzled buffer
11453 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11454 };
11455 unsigned Opc =
11456 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11457 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11458 MemSDNode *M = cast<MemSDNode>(Op);
11459
11460 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11461 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11462 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11463
11464 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11465 M->getMemoryVT(), M->getMemOperand());
11466 }
11467
11468 case Intrinsic::amdgcn_struct_buffer_store:
11469 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11470 case Intrinsic::amdgcn_struct_buffer_store_format:
11471 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11472 const bool IsFormat =
11473 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11474 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11475
11476 SDValue VData = Op.getOperand(2);
11477 EVT VDataVT = VData.getValueType();
11478 EVT EltType = VDataVT.getScalarType();
11479 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11480
11481 if (IsD16) {
11482 VData = handleD16VData(VData, DAG);
11483 VDataVT = VData.getValueType();
11484 }
11485
11486 if (!isTypeLegal(VDataVT)) {
11487 VData =
11488 DAG.getNode(ISD::BITCAST, DL,
11489 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11490 }
11491
11492 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11493 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11494 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11495 SDValue Ops[] = {
11496 Chain,
11497 VData,
11498 Rsrc,
11499 Op.getOperand(4), // vindex
11500 VOffset, // voffset
11501 SOffset, // soffset
11502 Offset, // offset
11503 Op.getOperand(7), // cachepolicy, swizzled buffer
11504 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11505 };
11506 unsigned Opc =
11507 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11508 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11509 MemSDNode *M = cast<MemSDNode>(Op);
11510
11511 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11512 EVT VDataType = VData.getValueType().getScalarType();
11513 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11514 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11515
11516 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11517 M->getMemoryVT(), M->getMemOperand());
11518 }
11519 case Intrinsic::amdgcn_raw_buffer_load_lds:
11520 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11521 case Intrinsic::amdgcn_struct_buffer_load_lds:
11522 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11523 if (!Subtarget->hasVMemToLDSLoad())
11524 return SDValue();
11525 unsigned Opc;
11526 bool HasVIndex =
11527 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11528 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11529 unsigned OpOffset = HasVIndex ? 1 : 0;
11530 SDValue VOffset = Op.getOperand(5 + OpOffset);
11531 bool HasVOffset = !isNullConstant(VOffset);
11532 unsigned Size = Op->getConstantOperandVal(4);
11533
11534 switch (Size) {
11535 default:
11536 return SDValue();
11537 case 1:
11538 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11539 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11540 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11541 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11542 break;
11543 case 2:
11544 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11545 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11546 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11547 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11548 break;
11549 case 4:
11550 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11551 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11552 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11553 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11554 break;
11555 case 12:
11556 if (!Subtarget->hasLDSLoadB96_B128())
11557 return SDValue();
11558 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11559 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11560 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11561 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11562 break;
11563 case 16:
11564 if (!Subtarget->hasLDSLoadB96_B128())
11565 return SDValue();
11566 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11567 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11568 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11569 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11570 break;
11571 }
11572
11573 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11574
11575     SmallVector<SDValue, 8> Ops;
11576
11577 if (HasVIndex && HasVOffset)
11578 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11579 {Op.getOperand(5), // VIndex
11580 VOffset}));
11581 else if (HasVIndex)
11582 Ops.push_back(Op.getOperand(5));
11583 else if (HasVOffset)
11584 Ops.push_back(VOffset);
11585
11586 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11587 Ops.push_back(Rsrc);
11588 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11589 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11590 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11591 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11592 Ops.push_back(DAG.getTargetConstant(
11593 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11594 DL, MVT::i8)); // cpol
11595 Ops.push_back(DAG.getTargetConstant(
11596 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11597 ? 1
11598 : 0,
11599 DL, MVT::i8)); // swz
11600 Ops.push_back(M0Val.getValue(0)); // Chain
11601 Ops.push_back(M0Val.getValue(1)); // Glue
11602
11603 auto *M = cast<MemSDNode>(Op);
11604 MachineMemOperand *LoadMMO = M->getMemOperand();
11605 // Don't set the offset value here because the pointer points to the base of
11606 // the buffer.
11607 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11608
11609 MachinePointerInfo StorePtrI = LoadPtrI;
11610 LoadPtrI.V = PoisonValue::get(
11611         PointerType::get(*DAG.getContext(), AMDGPUAS::BUFFER_RESOURCE));
11612     LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
11613     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11614
11615 auto F = LoadMMO->getFlags() &
11616              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11617     LoadMMO =
11618         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11619 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11620
11621 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11622 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11623 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11624
11625 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11626 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11627
11628 return SDValue(Load, 0);
11629 }
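// Note: the buffer-to-LDS case above (and the global-to-LDS case below) gives
// the selected machine node two memory operands via setNodeMemRefs, a load MMO
// for the VMEM source and a store MMO for the LDS destination, since the
// instruction both reads memory and writes LDS.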
11630 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11631 // for "trust me" that the remaining cases are global pointers until
11632 // such time as we can put two mem operands on an intrinsic.
11633 case Intrinsic::amdgcn_load_to_lds:
11634 case Intrinsic::amdgcn_global_load_lds: {
11635 if (!Subtarget->hasVMemToLDSLoad())
11636 return SDValue();
11637
11638 unsigned Opc;
11639 unsigned Size = Op->getConstantOperandVal(4);
11640 switch (Size) {
11641 default:
11642 return SDValue();
11643 case 1:
11644 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11645 break;
11646 case 2:
11647 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11648 break;
11649 case 4:
11650 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11651 break;
11652 case 12:
11653 if (!Subtarget->hasLDSLoadB96_B128())
11654 return SDValue();
11655 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11656 break;
11657 case 16:
11658 if (!Subtarget->hasLDSLoadB96_B128())
11659 return SDValue();
11660 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11661 break;
11662 }
11663
11664 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11665
11666     SmallVector<SDValue, 6> Ops;
11667
11668 SDValue Addr = Op.getOperand(2); // Global ptr
11669 SDValue VOffset;
11670 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11671 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11672 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11673 SDValue LHS = Addr.getOperand(0);
11674 SDValue RHS = Addr.getOperand(1);
11675
11676 if (LHS->isDivergent())
11677 std::swap(LHS, RHS);
11678
11679 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11680 RHS.getOperand(0).getValueType() == MVT::i32) {
11681 // add (i64 sgpr), (zero_extend (i32 vgpr))
11682 Addr = LHS;
11683 VOffset = RHS.getOperand(0);
11684 }
11685 }
11686
11687 Ops.push_back(Addr);
11688 if (!Addr->isDivergent()) {
11689       Opc = AMDGPU::getGlobalSaddrOp(Opc);
11690       if (!VOffset)
11691 VOffset =
11692 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11693 DAG.getTargetConstant(0, DL, MVT::i32)),
11694 0);
11695 Ops.push_back(VOffset);
11696 }
11697
11698 Ops.push_back(Op.getOperand(5)); // Offset
11699
11700 unsigned Aux = Op.getConstantOperandVal(6);
11701 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11702 MVT::i32)); // CPol
11703
11704 Ops.push_back(M0Val.getValue(0)); // Chain
11705 Ops.push_back(M0Val.getValue(1)); // Glue
11706
11707 auto *M = cast<MemSDNode>(Op);
11708 MachineMemOperand *LoadMMO = M->getMemOperand();
11709 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11710 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11711 MachinePointerInfo StorePtrI = LoadPtrI;
11712 LoadPtrI.V = PoisonValue::get(
11713         PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11714     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11715     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11716     auto F = LoadMMO->getFlags() &
11717              ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11718     LoadMMO =
11719         MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11720 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11721 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11722 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11723 LoadMMO->getAAInfo());
11724
11725 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11726 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11727
11728 return SDValue(Load, 0);
11729 }
11730 case Intrinsic::amdgcn_end_cf:
11731 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11732 Op->getOperand(2), Chain),
11733 0);
11734 case Intrinsic::amdgcn_s_barrier_init:
11735 case Intrinsic::amdgcn_s_barrier_signal_var: {
11736     // These two intrinsics have two operands: the barrier pointer and the member count.
11737 SDValue Chain = Op->getOperand(0);
11738     SmallVector<SDValue, 2> Ops;
11739     SDValue BarOp = Op->getOperand(2);
11740 SDValue CntOp = Op->getOperand(3);
11741 SDValue M0Val;
11742 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11743 ? AMDGPU::S_BARRIER_INIT_M0
11744 : AMDGPU::S_BARRIER_SIGNAL_M0;
11745 // extract the BarrierID from bits 4-9 of BarOp
11746 SDValue BarID;
11747 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11748 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11749 BarID =
11750 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11751 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11752 0);
11753 // Member count should be put into M0[ShAmt:+6]
11754 // Barrier ID should be put into M0[5:0]
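// With ShAmt == 16 below, the value copied to M0 ends up with the member count
// shifted into bits [21:16] and the 6-bit barrier ID in bits [5:0].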
11755 M0Val =
11756 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11757 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11758 0);
11759 constexpr unsigned ShAmt = 16;
11760 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11761 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11762
11763 M0Val = SDValue(
11764 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11765
11766 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11767
11768 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11769 return SDValue(NewMI, 0);
11770 }
11771 case Intrinsic::amdgcn_s_wakeup_barrier: {
11772 if (!Subtarget->hasSWakeupBarrier())
11773 return SDValue();
11774 [[fallthrough]];
11775 }
11776 case Intrinsic::amdgcn_s_barrier_join: {
11777     // These intrinsics take a single operand: the barrier pointer.
11778 SDValue Chain = Op->getOperand(0);
11779     SmallVector<SDValue, 2> Ops;
11780     SDValue BarOp = Op->getOperand(2);
11781 unsigned Opc;
11782
11783 if (isa<ConstantSDNode>(BarOp)) {
11784 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11785 switch (IntrinsicID) {
11786 default:
11787 return SDValue();
11788 case Intrinsic::amdgcn_s_barrier_join:
11789 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11790 break;
11791 case Intrinsic::amdgcn_s_wakeup_barrier:
11792 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11793 break;
11794 }
11795 // extract the BarrierID from bits 4-9 of the immediate
11796 unsigned BarID = (BarVal >> 4) & 0x3F;
11797 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11798 Ops.push_back(K);
11799 Ops.push_back(Chain);
11800 } else {
11801 switch (IntrinsicID) {
11802 default:
11803 return SDValue();
11804 case Intrinsic::amdgcn_s_barrier_join:
11805 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11806 break;
11807 case Intrinsic::amdgcn_s_wakeup_barrier:
11808 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11809 break;
11810 }
11811 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11812 SDValue M0Val;
11813 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11814 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11815 M0Val =
11816 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11817 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11818 0);
11819 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11820 }
11821
11822 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11823 return SDValue(NewMI, 0);
11824 }
11825 case Intrinsic::amdgcn_s_prefetch_data: {
11826 // For non-global address space preserve the chain and remove the call.
11827     if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
11828       return Op.getOperand(0);
11829 return Op;
11830 }
11831 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11832 SDValue Ops[] = {
11833 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11834 Op.getOperand(3), // offset
11835 Op.getOperand(4), // length
11836 };
11837
11838 MemSDNode *M = cast<MemSDNode>(Op);
11839 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11840 Op->getVTList(), Ops, M->getMemoryVT(),
11841 M->getMemOperand());
11842 }
11843 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11844 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11845 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11846 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11847 SDValue Chain = Op->getOperand(0);
11848 SDValue Ptr = Op->getOperand(2);
11849 SDValue Val = Op->getOperand(3);
11850 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11851 Ptr, MII->getMemOperand());
11852 }
11853 default: {
11854 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11855             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11856       return lowerImage(Op, ImageDimIntr, DAG, true);
11857
11858 return Op;
11859 }
11860 }
11861}
11862
11863// Return whether the operation has NoUnsignedWrap property.
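// An ISD::OR is also accepted: the callers only see it through
// isBaseWithConstantOffset(), where the OR is known to act as an add of
// disjoint values and therefore cannot wrap.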
11864static bool isNoUnsignedWrap(SDValue Addr) {
11865 return (Addr.getOpcode() == ISD::ADD &&
11866 Addr->getFlags().hasNoUnsignedWrap()) ||
11867 Addr->getOpcode() == ISD::OR;
11868}
11869
11871 EVT PtrVT) const {
11872 return PtrVT == MVT::i64;
11873}
11874
11876 EVT PtrVT) const {
11877 return true;
11878}
11879
11880// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11881// offset (the offset that is included in bounds checking and swizzling, to be
11882// split between the instruction's voffset and immoffset fields) and soffset
11883// (the offset that is excluded from bounds checking and swizzling, to go in
11884// the instruction's soffset field). This function takes the first kind of
11885// offset and figures out how to split it between voffset and immoffset.
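// For example, assuming a 12-bit immoffset field (MaxImm == 4095), a combined
// constant offset of 5000 is split into voffset = 4096 and immoffset = 904; the
// large power-of-two voffset part then has a good chance of being CSEd with
// neighbouring accesses.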
11886std::pair<SDValue, SDValue>
11887SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11888 SDLoc DL(Offset);
11889 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11890 SDValue N0 = Offset;
11891 ConstantSDNode *C1 = nullptr;
11892
11893 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11894 N0 = SDValue();
11895 else if (DAG.isBaseWithConstantOffset(N0)) {
11896 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11897 // being added, so we can only safely match a 32-bit addition with no
11898 // unsigned overflow.
11899 bool CheckNUW = Subtarget->hasGFX1250Insts();
11900 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11901 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11902 N0 = N0.getOperand(0);
11903 }
11904 }
11905
11906 if (C1) {
11907 unsigned ImmOffset = C1->getZExtValue();
11908 // If the immediate value is too big for the immoffset field, put only bits
11909 // that would normally fit in the immoffset field. The remaining value that
11910 // is copied/added for the voffset field is a large power of 2, and it
11911 // stands more chance of being CSEd with the copy/add for another similar
11912 // load/store.
11913     // However, do not do that rounding down if the resulting voffset value
11914     // would be negative, as it appears to be illegal to have a negative offset
11915     // in the vgpr, even if adding the immediate offset makes it positive.
11916 unsigned Overflow = ImmOffset & ~MaxImm;
11917 ImmOffset -= Overflow;
11918 if ((int32_t)Overflow < 0) {
11919 Overflow += ImmOffset;
11920 ImmOffset = 0;
11921 }
11922 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11923 if (Overflow) {
11924 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11925 if (!N0)
11926 N0 = OverflowVal;
11927 else {
11928 SDValue Ops[] = {N0, OverflowVal};
11929 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11930 }
11931 }
11932 }
11933 if (!N0)
11934 N0 = DAG.getConstant(0, DL, MVT::i32);
11935 if (!C1)
11936 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11937 return {N0, SDValue(C1, 0)};
11938}
11939
11940// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11941// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11942// pointed to by Offsets.
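// If the offset cannot be split, the entire combined offset is returned as the
// voffset, the soffset is zero (SGPR_NULL on subtargets with a restricted
// soffset field), and the instruction offset is 0.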
11943void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11944 SelectionDAG &DAG, SDValue *Offsets,
11945 Align Alignment) const {
11946 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11947 SDLoc DL(CombinedOffset);
11948 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11949 uint32_t Imm = C->getZExtValue();
11950 uint32_t SOffset, ImmOffset;
11951 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11952 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11953 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11954 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11955 return;
11956 }
11957 }
11958 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11959 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11960 // being added, so we can only safely match a 32-bit addition with no
11961 // unsigned overflow.
11962 bool CheckNUW = Subtarget->hasGFX1250Insts();
11963 SDValue N0 = CombinedOffset.getOperand(0);
11964 SDValue N1 = CombinedOffset.getOperand(1);
11965 uint32_t SOffset, ImmOffset;
11966 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11967 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
11968 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11969 Offsets[0] = N0;
11970 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11971 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11972 return;
11973 }
11974 }
11975
11976 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11977 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11978 : DAG.getConstant(0, DL, MVT::i32);
11979
11980 Offsets[0] = CombinedOffset;
11981 Offsets[1] = SOffsetZero;
11982 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11983}
11984
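// Reinterpret a buffer resource that is passed around as a scalar integer (an
// i128 fat-pointer style rsrc) as the v4i32 descriptor expected by the buffer
// nodes; anything that is not a scalar integer is returned unchanged.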
11985SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11986 SelectionDAG &DAG) const {
11987 if (!MaybePointer.getValueType().isScalarInteger())
11988 return MaybePointer;
11989
11990 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11991 return Rsrc;
11992}
11993
11994// Wrap a global or flat pointer into a buffer intrinsic using the flags
11995// specified in the intrinsic.
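// For the legacy path below, the four descriptor words are, roughly:
// word0 = base[31:0], word1 = base[47:32] | (stride << 16), word2 = num_records,
// word3 = flags. The 45-bit num_records form packs the fields as described in
// the comments inside that branch.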
11996SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11997 SelectionDAG &DAG) const {
11998 SDLoc Loc(Op);
11999
12000 SDValue Pointer = Op->getOperand(1);
12001 SDValue Stride = Op->getOperand(2);
12002 SDValue NumRecords = Op->getOperand(3);
12003 SDValue Flags = Op->getOperand(4);
12004
12005 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
12006 SDValue Rsrc;
12007
12008 if (Subtarget->has45BitNumRecordsBufferResource()) {
12009 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
12010 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12011 // num_records.
12012 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
12013 SDValue NumRecordsLHS =
12014 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
12015 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
12016 SDValue LowHalf =
12017 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
12018
12019 // Build the higher 64-bit value, which has the higher 38-bit num_records,
12020 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12021 SDValue NumRecordsRHS =
12022 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
12023 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
12024 SDValue ShiftedStride =
12025 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12026 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
12027 SDValue ExtShiftedStrideVec =
12028 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
12029 SDValue ExtShiftedStride =
12030 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
12031 SDValue ShiftedFlags =
12032 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
12033 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
12034 SDValue ExtShiftedFlagsVec =
12035 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
12036 SDValue ExtShiftedFlags =
12037 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
12038 SDValue CombinedFields =
12039 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12040 SDValue HighHalf =
12041 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12042
12043 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
12044 } else {
12045 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
12046 auto [LowHalf, HighHalf] =
12047 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12048 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
12049 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
12050 SDValue ShiftedStride =
12051 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12052 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
12053 SDValue NewHighHalf =
12054 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
12055
12056 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
12057 NumRecords, Flags);
12058 }
12059
12060 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
12061 return RsrcPtr;
12062}
12063
12064// Handle 8 bit and 16 bit buffer loads
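// In the TFE case the load produces a v2i32 of {data, status}; the data word is
// truncated back to the requested 8/16-bit type and returned together with the
// status word and the chain.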
12065SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12066 EVT LoadVT, SDLoc DL,
12067                                                      ArrayRef<SDValue> Ops,
12068                                                      MachineMemOperand *MMO,
12069 bool IsTFE) const {
12070 EVT IntVT = LoadVT.changeTypeToInteger();
12071
12072 if (IsTFE) {
12073 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12074 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12075 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12076 MachineFunction &MF = DAG.getMachineFunction();
12077 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
12078 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
12079 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
12080 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12081 DAG.getConstant(1, DL, MVT::i32));
12082 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12083 DAG.getConstant(0, DL, MVT::i32));
12084 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
12085 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
12086 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
12087 }
12088
12089 unsigned Opc = LoadVT.getScalarType() == MVT::i8
12090 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12091 : AMDGPUISD::BUFFER_LOAD_USHORT;
12092
12093 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
12094 SDValue BufferLoad =
12095 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
12096 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
12097 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
12098
12099 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
12100}
12101
12102// Handle 8 bit and 16 bit buffer stores
12103SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12104 EVT VDataType, SDLoc DL,
12105 SDValue Ops[],
12106 MemSDNode *M) const {
12107 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12108 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
12109
12110 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
12111 Ops[1] = BufferStoreExt;
12112 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12113 : AMDGPUISD::BUFFER_STORE_SHORT;
12114 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12115 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
12116 M->getMemOperand());
12117}
12118
12119 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12120                                  SDValue Op, const SDLoc &SL, EVT VT) {
12121 if (VT.bitsLT(Op.getValueType()))
12122 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
12123
12124 switch (ExtType) {
12125 case ISD::SEXTLOAD:
12126 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
12127 case ISD::ZEXTLOAD:
12128 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
12129 case ISD::EXTLOAD:
12130 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
12131 case ISD::NON_EXTLOAD:
12132 return Op;
12133 }
12134
12135 llvm_unreachable("invalid ext type");
12136}
12137
12138// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
12139// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
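// The load is rewritten as a plain 32-bit load of the same 4-byte aligned
// address, and the original sub-dword semantics are recreated afterwards with
// sign_extend_inreg / zero-extend-in-reg, which lets the access be selected to a
// scalar (SMEM) instruction.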
12140SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
12141 DAGCombinerInfo &DCI) const {
12142 SelectionDAG &DAG = DCI.DAG;
12143 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
12144 return SDValue();
12145
12146 // FIXME: Constant loads should all be marked invariant.
12147 unsigned AS = Ld->getAddressSpace();
12148 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
12149       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
12150       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
12151 return SDValue();
12152
12153 // Don't do this early, since it may interfere with adjacent load merging for
12154 // illegal types. We can avoid losing alignment information for exotic types
12155 // pre-legalize.
12156 EVT MemVT = Ld->getMemoryVT();
12157 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
12158 MemVT.getSizeInBits() >= 32)
12159 return SDValue();
12160
12161 SDLoc SL(Ld);
12162
12163 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
12164 "unexpected vector extload");
12165
12166 // TODO: Drop only high part of range.
12167 SDValue Ptr = Ld->getBasePtr();
12168 SDValue NewLoad = DAG.getLoad(
12169 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
12170 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
12171 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
12172 nullptr); // Drop ranges
12173
12174 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
12175 if (MemVT.isFloatingPoint()) {
12177 "unexpected fp extload");
12178 TruncVT = MemVT.changeTypeToInteger();
12179 }
12180
12181 SDValue Cvt = NewLoad;
12182 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12183 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
12184 DAG.getValueType(TruncVT));
12185 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12186              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
12187     Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
12188 } else {
12189     assert(Ld->getExtensionType() == ISD::EXTLOAD);
12190   }
12191
12192 EVT VT = Ld->getValueType(0);
12193 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
12194
12195 DCI.AddToWorklist(Cvt.getNode());
12196
12197 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12198 // the appropriate extension from the 32-bit load.
12199 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
12200 DCI.AddToWorklist(Cvt.getNode());
12201
12202 // Handle conversion back to floating point if necessary.
12203 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
12204
12205 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
12206}
12207
12208 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12209                                           const SIMachineFunctionInfo &Info) {
12210 // TODO: Should check if the address can definitely not access stack.
12211 if (Info.isEntryFunction())
12212 return Info.getUserSGPRInfo().hasFlatScratchInit();
12213 return true;
12214}
12215
12216SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12217 SDLoc DL(Op);
12218 LoadSDNode *Load = cast<LoadSDNode>(Op);
12219 ISD::LoadExtType ExtType = Load->getExtensionType();
12220 EVT MemVT = Load->getMemoryVT();
12221 MachineMemOperand *MMO = Load->getMemOperand();
12222
12223 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12224 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
12225 return SDValue();
12226
12227 // FIXME: Copied from PPC
12228 // First, load into 32 bits, then truncate to 1 bit.
12229
12230 SDValue Chain = Load->getChain();
12231 SDValue BasePtr = Load->getBasePtr();
12232
12233 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12234
12235 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
12236 RealMemVT, MMO);
12237
12238 if (!MemVT.isVector()) {
12239 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
12240 NewLD.getValue(1)};
12241
12242 return DAG.getMergeValues(Ops, DL);
12243 }
12244
12245     SmallVector<SDValue, 4> Elts;
12246     for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12247 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12248 DAG.getConstant(I, DL, MVT::i32));
12249
12250 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12251 }
12252
12253 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12254
12255 return DAG.getMergeValues(Ops, DL);
12256 }
12257
12258 if (!MemVT.isVector())
12259 return SDValue();
12260
12261 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12262 "Custom lowering for non-i32 vectors hasn't been implemented.");
12263
12264 Align Alignment = Load->getAlign();
12265 unsigned AS = Load->getAddressSpace();
12266 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12267 AS == AMDGPUAS::FLAT_ADDRESS &&
12268 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12269 return SplitVectorLoad(Op, DAG);
12270 }
12271
12272 MachineFunction &MF = DAG.getMachineFunction();
12273 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12274   // If there is a possibility that a flat instruction may access scratch
12275   // memory, then we need to use the same legalization rules we use for private.
12276 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12277 !Subtarget->hasMultiDwordFlatScratchAddressing())
12278 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12279              ? AMDGPUAS::PRIVATE_ADDRESS
12280              : AMDGPUAS::GLOBAL_ADDRESS;
12281
12282 unsigned NumElements = MemVT.getVectorNumElements();
12283
12284 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12285       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12286       (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12287 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12288 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12289 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12290 Alignment >= Align(4) && NumElements < 32) {
12291 if (MemVT.isPow2VectorType() ||
12292 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12293 return SDValue();
12294 return WidenOrSplitVectorLoad(Op, DAG);
12295 }
12296 // Non-uniform loads will be selected to MUBUF instructions, so they
12297 // have the same legalization requirements as global and private
12298 // loads.
12299 //
12300 }
12301 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12302       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12303       AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12304     if (NumElements > 4)
12305 return SplitVectorLoad(Op, DAG);
12306 // v3 loads not supported on SI.
12307 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12308 return WidenOrSplitVectorLoad(Op, DAG);
12309
12310 // v3 and v4 loads are supported for private and global memory.
12311 return SDValue();
12312 }
12313 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12314 // Depending on the setting of the private_element_size field in the
12315 // resource descriptor, we can only make private accesses up to a certain
12316 // size.
12317 switch (Subtarget->getMaxPrivateElementSize()) {
12318 case 4: {
12319 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
12320 return DAG.getMergeValues({Op0, Op1}, DL);
12321 }
12322 case 8:
12323 if (NumElements > 2)
12324 return SplitVectorLoad(Op, DAG);
12325 return SDValue();
12326 case 16:
12327 // Same as global/flat
12328 if (NumElements > 4)
12329 return SplitVectorLoad(Op, DAG);
12330 // v3 loads not supported on SI.
12331 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12332 return WidenOrSplitVectorLoad(Op, DAG);
12333
12334 return SDValue();
12335 default:
12336 llvm_unreachable("unsupported private_element_size");
12337 }
12338 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12339 unsigned Fast = 0;
12340 auto Flags = Load->getMemOperand()->getFlags();
12341     if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
12342                                            Load->getAlign(), Flags, &Fast) &&
12343 Fast > 1)
12344 return SDValue();
12345
12346 if (MemVT.isVector())
12347 return SplitVectorLoad(Op, DAG);
12348 }
12349
12350   if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12351                                       MemVT, *Load->getMemOperand())) {
12352 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
12353 return DAG.getMergeValues({Op0, Op1}, DL);
12354 }
12355
12356 return SDValue();
12357}
12358
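// 64-bit selects are lowered by bitcasting both operands to v2i32 and selecting
// the low and high halves separately, so each half can be selected as a 32-bit
// conditional move.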
12359SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12360 EVT VT = Op.getValueType();
12361 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12362 VT.getSizeInBits() == 512)
12363 return splitTernaryVectorOp(Op, DAG);
12364
12365 assert(VT.getSizeInBits() == 64);
12366
12367 SDLoc DL(Op);
12368 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
12369
12370 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
12371 SDValue One = DAG.getConstant(1, DL, MVT::i32);
12372
12373 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
12374 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
12375
12376 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
12377 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
12378
12379 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
12380
12381 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
12382 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
12383
12384 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
12385
12386 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
12387 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
12388}
12389
12390// Catch division cases where we can use shortcuts with rcp and rsq
12391// instructions.
12392SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12393 SelectionDAG &DAG) const {
12394 SDLoc SL(Op);
12395 SDValue LHS = Op.getOperand(0);
12396 SDValue RHS = Op.getOperand(1);
12397 EVT VT = Op.getValueType();
12398 const SDNodeFlags Flags = Op->getFlags();
12399
12400 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12401
12402 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
12403 // Without !fpmath accuracy information, we can't do more because we don't
12404 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
12405 // f16 is always accurate enough
12406 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12407 return SDValue();
12408
12409 if (CLHS->isExactlyValue(1.0)) {
12410 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12411       // the CI documentation have a worst case error of 1 ulp.
12412 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12413 // use it as long as we aren't trying to use denormals.
12414 //
12415       // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
12416
12417 // 1.0 / sqrt(x) -> rsq(x)
12418
12419 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12420 // error seems really high at 2^29 ULP.
12421 // 1.0 / x -> rcp(x)
12422 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12423 }
12424
12425 // Same as for 1.0, but expand the sign out of the constant.
12426 if (CLHS->isExactlyValue(-1.0)) {
12427 // -1.0 / x -> rcp (fneg x)
12428 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12429 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12430 }
12431 }
12432
12433 // For f16 and bf16 require afn or arcp.
12434 // For f32 require afn.
12435 if (!AllowInaccurateRcp &&
12436 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12437 return SDValue();
12438
12439 // Turn into multiply by the reciprocal.
12440 // x / y -> x * (1.0 / y)
12441 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12442 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12443}
12444
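// Fast, approximate f64 division: start from rcp(Y), refine it with two
// Newton-Raphson style FMA iterations, and apply one last FMA on the residual of
// X * R. Only used when approximate functions are allowed.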
12445SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12446 SelectionDAG &DAG) const {
12447 SDLoc SL(Op);
12448 SDValue X = Op.getOperand(0);
12449 SDValue Y = Op.getOperand(1);
12450 EVT VT = Op.getValueType();
12451 const SDNodeFlags Flags = Op->getFlags();
12452
12453 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12454 if (!AllowInaccurateDiv)
12455 return SDValue();
12456
12457 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12458 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12459
12460 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12461 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12462
12463 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12464 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12465 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12466 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12467 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12468 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12469}
12470
12471static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12472 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12473 SDNodeFlags Flags) {
12474 if (GlueChain->getNumValues() <= 1) {
12475 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12476 }
12477
12478 assert(GlueChain->getNumValues() == 3);
12479
12480 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12481 switch (Opcode) {
12482 default:
12483 llvm_unreachable("no chain equivalent for opcode");
12484 case ISD::FMUL:
12485 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12486 break;
12487 }
12488
12489 return DAG.getNode(Opcode, SL, VTList,
12490 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12491 Flags);
12492}
12493
12494static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12495 EVT VT, SDValue A, SDValue B, SDValue C,
12496 SDValue GlueChain, SDNodeFlags Flags) {
12497 if (GlueChain->getNumValues() <= 1) {
12498 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12499 }
12500
12501 assert(GlueChain->getNumValues() == 3);
12502
12503 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12504 switch (Opcode) {
12505 default:
12506 llvm_unreachable("no chain equivalent for opcode");
12507 case ISD::FMA:
12508 Opcode = AMDGPUISD::FMA_W_CHAIN;
12509 break;
12510 }
12511
12512 return DAG.getNode(Opcode, SL, VTList,
12513 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12514 Flags);
12515}
12516
12517SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12518 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12519 return FastLowered;
12520
12521 SDLoc SL(Op);
12522 EVT VT = Op.getValueType();
12523 SDValue LHS = Op.getOperand(0);
12524 SDValue RHS = Op.getOperand(1);
12525
12526 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12527 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12528
12529 if (VT == MVT::bf16) {
12530 SDValue ExtDiv =
12531 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12532 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12533 DAG.getTargetConstant(0, SL, MVT::i32));
12534 }
12535
12536 assert(VT == MVT::f16);
12537
12538 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12539 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12540 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12541 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12542 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12543 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12544 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12545 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12546 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12547 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12548 // q16.u = opx(V_CVT_F16_F32, q32.u);
12549 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12550
12551 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12552 unsigned FMADOpCode =
12553       isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12554   SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12555 SDValue Rcp =
12556 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12557 SDValue Quot =
12558 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12559 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12560 Op->getFlags());
12561 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12562 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12563 Op->getFlags());
12564 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12565 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12566 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12567 DAG.getConstant(0xff800000, SL, MVT::i32));
12568 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12569 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12570 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12571 DAG.getTargetConstant(0, SL, MVT::i32));
12572 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12573 Op->getFlags());
12574}
12575
12576// Faster 2.5 ULP division that does not support denormals.
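// If |RHS| is larger than 0x1p+96 the denominator is pre-scaled by 0x1p-32
// before the rcp so the reciprocal does not underflow to a denormal or zero, and
// the final product is multiplied by the same scale factor to compensate.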
12577SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12578 SDNodeFlags Flags = Op->getFlags();
12579 SDLoc SL(Op);
12580 SDValue LHS = Op.getOperand(1);
12581 SDValue RHS = Op.getOperand(2);
12582
12583 // TODO: The combiner should probably handle elimination of redundant fabs.
12585 ? RHS
12586 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12587
12588 const APFloat K0Val(0x1p+96f);
12589 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12590
12591 const APFloat K1Val(0x1p-32f);
12592 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12593
12594 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12595
12596 EVT SetCCVT =
12597 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12598
12599 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12600
12601 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12602
12603 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12604
12605 // rcp does not support denormals.
12606 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12607
12608 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12609
12610 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12611}
12612
12613// Returns immediate value for setting the F32 denorm mode when using the
12614// S_DENORM_MODE instruction.
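// The returned immediate packs the requested single-precision denorm mode into
// bits [1:0] and keeps the function's default double-precision mode in bits
// [3:2], which is the layout S_DENORM_MODE expects.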
12615 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12616                                     const SIMachineFunctionInfo *Info,
12617 const GCNSubtarget *ST) {
12618 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12619 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12620 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12621 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12622}
12623
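// The full-rate f32 path below follows the usual AMDGPU recipe: div_scale the
// numerator and denominator, refine rcp of the scaled denominator with a chain
// of FMAs, combine the pieces with div_fmas and finish with div_fixup. If the
// function's FP32 denormal mode flushes, denormal support is temporarily
// enabled around the FMA chain and the previous mode is restored afterwards.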
12624SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12625 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12626 return FastLowered;
12627
12628   // The selection matcher assumes that anything with a chain selects to a
12629   // mayRaiseFPException machine instruction. Since we're introducing a chain
12630 // here, we need to explicitly report nofpexcept for the regular fdiv
12631 // lowering.
12632 SDNodeFlags Flags = Op->getFlags();
12633 Flags.setNoFPExcept(true);
12634
12635 SDLoc SL(Op);
12636 SDValue LHS = Op.getOperand(0);
12637 SDValue RHS = Op.getOperand(1);
12638
12639 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12640
12641 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12642
12643 SDValue DenominatorScaled =
12644 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12645 SDValue NumeratorScaled =
12646 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12647
12648 // Denominator is scaled to not be denormal, so using rcp is ok.
12649 SDValue ApproxRcp =
12650 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12651 SDValue NegDivScale0 =
12652 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12653
12654 using namespace AMDGPU::Hwreg;
12655 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12656 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12657
12658 const MachineFunction &MF = DAG.getMachineFunction();
12659 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12660 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12661
12662 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12663 const bool HasDynamicDenormals =
12664 (DenormMode.Input == DenormalMode::Dynamic) ||
12665 (DenormMode.Output == DenormalMode::Dynamic);
12666
12667 SDValue SavedDenormMode;
12668
12669 if (!PreservesDenormals) {
12670 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12671 // lowering. The chain dependence is insufficient, and we need glue. We do
12672 // not need the glue variants in a strictfp function.
12673
12674 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12675
12676 SDValue Glue = DAG.getEntryNode();
12677 if (HasDynamicDenormals) {
12678 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12679 DAG.getVTList(MVT::i32, MVT::Glue),
12680 {BitField, Glue});
12681 SavedDenormMode = SDValue(GetReg, 0);
12682
12683 Glue = DAG.getMergeValues(
12684 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12685 }
12686
12687 SDNode *EnableDenorm;
12688 if (Subtarget->hasDenormModeInst()) {
12689 const SDValue EnableDenormValue =
12690 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12691
12692 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12693 EnableDenormValue)
12694 .getNode();
12695 } else {
12696 const SDValue EnableDenormValue =
12697 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12698 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12699 {EnableDenormValue, BitField, Glue});
12700 }
12701
12702 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12703 SDValue(EnableDenorm, 1)};
12704
12705 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12706 }
12707
12708 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12709 ApproxRcp, One, NegDivScale0, Flags);
12710
12711 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12712 ApproxRcp, Fma0, Flags);
12713
12714 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12715 Fma1, Flags);
12716
12717 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12718 NumeratorScaled, Mul, Flags);
12719
12720 SDValue Fma3 =
12721 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12722
12723 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12724 NumeratorScaled, Fma3, Flags);
12725
12726 if (!PreservesDenormals) {
12727 SDNode *DisableDenorm;
12728 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12729 const SDValue DisableDenormValue = getSPDenormModeValue(
12730 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12731
12732 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12733 DisableDenorm =
12734 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12735 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12736 .getNode();
12737 } else {
12738 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12739 const SDValue DisableDenormValue =
12740 HasDynamicDenormals
12741 ? SavedDenormMode
12742 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12743
12744 DisableDenorm = DAG.getMachineNode(
12745 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12746 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12747 }
12748
12749 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12750 SDValue(DisableDenorm, 0), DAG.getRoot());
12751 DAG.setRoot(OutputChain);
12752 }
12753
12754 SDValue Scale = NumeratorScaled.getValue(1);
12755 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12756 {Fma4, Fma1, Fma3, Scale}, Flags);
12757
12758 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12759}
12760
12761SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12762 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12763 return FastLowered;
12764
12765 SDLoc SL(Op);
12766 SDValue X = Op.getOperand(0);
12767 SDValue Y = Op.getOperand(1);
12768
12769 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12770
12771 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12772
12773 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12774
12775 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12776
12777 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12778
12779 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12780
12781 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12782
12783 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12784
12785 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12786
12787 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12788 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12789
12790 SDValue Fma4 =
12791 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12792
12793 SDValue Scale;
12794
12795 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12796     // Work around a hardware bug on SI where the condition output from div_scale
12797 // is not usable.
12798
12799 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12800
12801     // Figure out which scale to use for div_fmas.
12802 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12803 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12804 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12805 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12806
12807 SDValue NumHi =
12808 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12809 SDValue DenHi =
12810 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12811
12812 SDValue Scale0Hi =
12813 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12814 SDValue Scale1Hi =
12815 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12816
12817 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12818 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12819 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12820 } else {
12821 Scale = DivScale1.getValue(1);
12822 }
12823
12824 SDValue Fmas =
12825 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12826
12827 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12828}
12829
12830SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12831 EVT VT = Op.getValueType();
12832
12833 if (VT == MVT::f32)
12834 return LowerFDIV32(Op, DAG);
12835
12836 if (VT == MVT::f64)
12837 return LowerFDIV64(Op, DAG);
12838
12839 if (VT == MVT::f16 || VT == MVT::bf16)
12840 return LowerFDIV16(Op, DAG);
12841
12842 llvm_unreachable("Unexpected type for fdiv");
12843}
12844
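// FFREXP is expanded to the amdgcn frexp_mant / frexp_exp intrinsics. On
// subtargets with the fract bug, infinities and NaNs are detected with a
// compare of |Val| against +inf and the results are patched with selects
// (exponent 0, mantissa equal to the input) to match the generic semantics.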
12845SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12846 SDLoc dl(Op);
12847 SDValue Val = Op.getOperand(0);
12848 EVT VT = Val.getValueType();
12849 EVT ResultExpVT = Op->getValueType(1);
12850 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12851
12852 SDValue Mant = DAG.getNode(
12853       ISD::INTRINSIC_WO_CHAIN, dl, VT,
12854       DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12855
12856 SDValue Exp = DAG.getNode(
12857 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12858 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12859
12860 if (Subtarget->hasFractBug()) {
12861 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12862 SDValue Inf =
12863         DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12864
12865 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12866 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12867 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12868 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12869 }
12870
12871 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12872 return DAG.getMergeValues({Mant, CastExp}, dl);
12873}
12874
12875SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12876 SDLoc DL(Op);
12877 StoreSDNode *Store = cast<StoreSDNode>(Op);
12878 EVT VT = Store->getMemoryVT();
12879
12880 if (VT == MVT::i1) {
12881 return DAG.getTruncStore(
12882 Store->getChain(), DL,
12883 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12884 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12885 }
12886
12887 assert(VT.isVector() &&
12888 Store->getValue().getValueType().getScalarType() == MVT::i32);
12889
12890 unsigned AS = Store->getAddressSpace();
12891 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12892 AS == AMDGPUAS::FLAT_ADDRESS &&
12893 Store->getAlign().value() < VT.getStoreSize() &&
12894 VT.getSizeInBits() > 32) {
12895 return SplitVectorStore(Op, DAG);
12896 }
12897
12898 MachineFunction &MF = DAG.getMachineFunction();
12899 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12900  // If there is a possibility that a flat instruction accesses scratch memory,
12901  // then we need to use the same legalization rules we use for private.
12902 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12903 !Subtarget->hasMultiDwordFlatScratchAddressing())
12904 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12905            ? AMDGPUAS::PRIVATE_ADDRESS
12906            : AMDGPUAS::GLOBAL_ADDRESS;
12907
12908 unsigned NumElements = VT.getVectorNumElements();
12909  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12910     if (NumElements > 4)
12911 return SplitVectorStore(Op, DAG);
12912 // v3 stores not supported on SI.
12913 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12914 return SplitVectorStore(Op, DAG);
12915
12916    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12917                                        VT, *Store->getMemOperand()))
12918 return expandUnalignedStore(Store, DAG);
12919
12920 return SDValue();
12921 }
12922 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12923 switch (Subtarget->getMaxPrivateElementSize()) {
12924 case 4:
12925 return scalarizeVectorStore(Store, DAG);
12926 case 8:
12927 if (NumElements > 2)
12928 return SplitVectorStore(Op, DAG);
12929 return SDValue();
12930 case 16:
12931 if (NumElements > 4 ||
12932 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
12933 return SplitVectorStore(Op, DAG);
12934 return SDValue();
12935 default:
12936 llvm_unreachable("unsupported private_element_size");
12937 }
12938 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12939 unsigned Fast = 0;
12940 auto Flags = Store->getMemOperand()->getFlags();
12941    if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
12942                                           Store->getAlign(), Flags, &Fast) &&
12943 Fast > 1)
12944 return SDValue();
12945
12946 if (VT.isVector())
12947 return SplitVectorStore(Op, DAG);
12948
12949 return expandUnalignedStore(Store, DAG);
12950 }
12951
12952 // Probably an invalid store. If so we'll end up emitting a selection error.
12953 return SDValue();
12954}
12955
12956// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12957SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12958 SDLoc SL(Op);
12959 assert(!Subtarget->has16BitInsts());
12960 SDNodeFlags Flags = Op->getFlags();
12961 SDValue Ext =
12962 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12963
12964 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12965 SDValue Sqrt =
12966 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12967
12968 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12969 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12970}
12971
12972SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12973 SDLoc DL(Op);
12974 SDNodeFlags Flags = Op->getFlags();
12975 MVT VT = Op.getValueType().getSimpleVT();
12976 const SDValue X = Op.getOperand(0);
12977
12978 if (allowApproxFunc(DAG, Flags)) {
12979 // Instruction is 1ulp but ignores denormals.
12980 return DAG.getNode(
12981        ISD::INTRINSIC_WO_CHAIN, DL, VT,
12982         DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12983 }
12984
12985 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12986 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12987
12988 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12989
12990 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12991
12992 SDValue SqrtX =
12993 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12994
12995 SDValue SqrtS;
12996 if (needsDenormHandlingF32(DAG, X, Flags)) {
12997 SDValue SqrtID =
12998 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12999 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
13000
13001 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
13002 SDValue SqrtSNextDownInt =
13003 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13004 DAG.getAllOnesConstant(DL, MVT::i32));
13005 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
13006
13007 SDValue NegSqrtSNextDown =
13008 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
13009
13010 SDValue SqrtVP =
13011 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
13012
13013 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13014 DAG.getConstant(1, DL, MVT::i32));
13015 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
13016
13017 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
13018 SDValue SqrtVS =
13019 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
13020
13021 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
13022 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
13023
13024 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
13025 Flags);
13026
13027 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
13028 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
13029 Flags);
13030 } else {
13031 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
13032
13033 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
13034
13035 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
13036 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
13037 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
13038
13039 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
13040 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
13041 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
13042
13043 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
13044 SDValue SqrtD =
13045 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
13046 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
13047 }
13048
13049 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
13050
13051 SDValue ScaledDown =
13052 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
13053
13054 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
13055 SDValue IsZeroOrInf =
13056 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13057 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13058
13059 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
13060}
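// Editorial sketch (not part of the lowering, illustrative only): a scalar C++
// model of the scaling performed by lowerFSQRTF32 above, using std::sqrt from
// <cmath> as a stand-in for the refined sqrt sequence. Inputs below 0x1.0p-96
// are scaled by 0x1.0p+32 so the refinement sees a value away from the
// denormal range; since sqrt(x * 2^32) == sqrt(x) * 2^16, the result is then
// rescaled by 0x1.0p-16. The zero/inf fixup select is omitted here.
[[maybe_unused]] static float sqrtF32ScalingModel(float X) {
  bool NeedScale = X < 0x1.0p-96f;
  float ScaledX = NeedScale ? X * 0x1.0p+32f : X;
  float SqrtS = std::sqrt(ScaledX); // stands in for the refined sqrt above
  return NeedScale ? SqrtS * 0x1.0p-16f : SqrtS;
}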
13061
13062SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13063  // For the double type, the SQRT and RSQ instructions don't have the required
13064  // precision, so we apply Goldschmidt's algorithm to improve the result:
13065 //
13066 // y0 = rsq(x)
13067 // g0 = x * y0
13068 // h0 = 0.5 * y0
13069 //
13070 // r0 = 0.5 - h0 * g0
13071 // g1 = g0 * r0 + g0
13072 // h1 = h0 * r0 + h0
13073 //
13074 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
13075 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
13076 // h2 = h1 * r1 + h1
13077 //
13078 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
13079 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
13080 //
13081 // sqrt(x) = g3
13082
13083 SDNodeFlags Flags = Op->getFlags();
13084
13085 SDLoc DL(Op);
13086
13087 SDValue X = Op.getOperand(0);
13088 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
13089
13090 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
13091
13092 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
13093
13094 // Scale up input if it is too small.
13095 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
13096 SDValue ScaleUp =
13097 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
13098 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
13099
13100 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
13101
13102 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
13103
13104 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
13105 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
13106
13107 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
13108 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
13109
13110 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
13111
13112 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
13113
13114 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
13115 SDValue SqrtD0 =
13116 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
13117
13118 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
13119
13120 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
13121 SDValue SqrtD1 =
13122 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
13123
13124 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
13125
13126 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
13127 SDValue ScaleDown =
13128 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
13129 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
13130
13131 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
13132 // with finite only or nsz because rsq(+/-0) = +/-inf
13133
13134 // TODO: Check for DAZ and expand to subnormals
13135 SDValue IsZeroOrInf =
13136 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13137 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13138
13139 // If x is +INF, +0, or -0, use its original value
13140 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
13141 Flags);
13142}
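// Editorial sketch (illustrative only, not used by the lowering): the
// Goldschmidt recurrence from the comment above written as plain C++, with
// std::fma and std::sqrt from <cmath> standing in for the ISD::FMA and
// AMDGPUISD::RSQ nodes. Input scaling and the zero/inf fixup are omitted.
[[maybe_unused]] static double goldschmidtSqrtModel(double X) {
  double Y0 = 1.0 / std::sqrt(X);  // y0 = rsq(x)
  double G = X * Y0;               // g0
  double H = 0.5 * Y0;             // h0
  double R = std::fma(-H, G, 0.5); // r0 = 0.5 - h0 * g0
  G = std::fma(G, R, G);           // g1 = g0 * r0 + g0
  H = std::fma(H, R, H);           // h1 = h0 * r0 + h0
  double D = std::fma(-G, G, X);   // d0 = x - g1 * g1
  G = std::fma(D, H, G);           // g2 = d0 * h1 + g1
  D = std::fma(-G, G, X);          // d1 = x - g2 * g2
  return std::fma(D, H, G);        // g3 ~= sqrt(x)
}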
13143
13144SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13145 SDLoc DL(Op);
13146 EVT VT = Op.getValueType();
13147 SDValue Arg = Op.getOperand(0);
13148 SDValue TrigVal;
13149
13150 // Propagate fast-math flags so that the multiply we introduce can be folded
13151 // if Arg is already the result of a multiply by constant.
13152 auto Flags = Op->getFlags();
13153
13154 // AMDGPUISD nodes of vector type must be unrolled here since
13155 // they will not be expanded elsewhere.
13156 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13157 if (!V.getValueType().isVector())
13158 return V;
13159
13160 return DAG.UnrollVectorOp(cast<SDNode>(V));
13161 };
13162
13163 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
13164
13165 if (Subtarget->hasTrigReducedRange()) {
13166 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13167 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13168 } else {
13169 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13170 }
13171
13172 switch (Op.getOpcode()) {
13173 case ISD::FCOS:
13174 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13175 break;
13176 case ISD::FSIN:
13177 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13178 break;
13179 default:
13180 llvm_unreachable("Wrong trig opcode");
13181 }
13182
13183 return UnrollIfVec(TrigVal);
13184}
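// Editorial sketch (illustrative only): SIN_HW/COS_HW consume their operand in
// revolutions rather than radians, which is why LowerTrig multiplies by
// 0.5 * numbers::inv_pi first; on subtargets with a reduced trig range the
// product is additionally wrapped into [0, 1) with FRACT. A scalar model of
// the sin path, using <cmath> as a stand-in for the hardware op:
[[maybe_unused]] static float sinLoweringModel(float Arg,
                                               bool HasTrigReducedRange) {
  float T = Arg * static_cast<float>(0.5 * numbers::inv_pi); // radians -> revs
  if (HasTrigReducedRange)
    T -= std::floor(T); // FRACT: wrap into [0, 1)
  return std::sin(T * static_cast<float>(2.0 * numbers::pi)); // SIN_HW model
}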
13185
13186SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
13187 SelectionDAG &DAG) const {
13188 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
13189 assert(AtomicNode->isCompareAndSwap());
13190 unsigned AS = AtomicNode->getAddressSpace();
13191
13192 // No custom lowering required for local address space
13193  if (!AMDGPU::isFlatGlobalAddrSpace(AS))
13194    return Op;
13195
13196 // Non-local address space requires custom lowering for atomic compare
13197 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
13198 SDLoc DL(Op);
13199 SDValue ChainIn = Op.getOperand(0);
13200 SDValue Addr = Op.getOperand(1);
13201 SDValue Old = Op.getOperand(2);
13202 SDValue New = Op.getOperand(3);
13203 EVT VT = Op.getValueType();
13204 MVT SimpleVT = VT.getSimpleVT();
13205 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
13206
13207 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
13208 SDValue Ops[] = {ChainIn, Addr, NewOld};
13209
13210 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
13211 Op->getVTList(), Ops, VT,
13212 AtomicNode->getMemOperand());
13213}
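// Editorial example (DAG shape only, not emitted verbatim): for a 32-bit
// flat/global cmpxchg of %old -> %new, the custom lowering above produces
//   v2i32 NewOld = BUILD_VECTOR %new, %old
//   AMDGPUISD::ATOMIC_CMP_SWAP chain, %addr, NewOld
// i.e. the swap value sits in element 0 and the compare value in element 1 of
// the packed data operand; the _X2 variants use v2i64 in the same order.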
13214
13215//===----------------------------------------------------------------------===//
13216// Custom DAG optimizations
13217//===----------------------------------------------------------------------===//
13218
13219SDValue
13220SITargetLowering::performUCharToFloatCombine(SDNode *N,
13221 DAGCombinerInfo &DCI) const {
13222 EVT VT = N->getValueType(0);
13223 EVT ScalarVT = VT.getScalarType();
13224 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13225 return SDValue();
13226
13227 SelectionDAG &DAG = DCI.DAG;
13228 SDLoc DL(N);
13229
13230 SDValue Src = N->getOperand(0);
13231 EVT SrcVT = Src.getValueType();
13232
13233 // TODO: We could try to match extracting the higher bytes, which would be
13234 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13235 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13236 // about in practice.
13237 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13238 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13239 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13240 DCI.AddToWorklist(Cvt.getNode());
13241
13242 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13243 if (ScalarVT != MVT::f32) {
13244 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13245 DAG.getTargetConstant(0, DL, MVT::i32));
13246 }
13247 return Cvt;
13248 }
13249 }
13250
13251 return SDValue();
13252}
13253
13254SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13255 DAGCombinerInfo &DCI) const {
13256 SDValue MagnitudeOp = N->getOperand(0);
13257 SDValue SignOp = N->getOperand(1);
13258
13259 // The generic combine for fcopysign + fp cast is too conservative with
13260 // vectors, and also gets confused by the splitting we will perform here, so
13261 // peek through FP casts.
13262 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13263 SignOp.getOpcode() == ISD::FP_ROUND)
13264 SignOp = SignOp.getOperand(0);
13265
13266 SelectionDAG &DAG = DCI.DAG;
13267 SDLoc DL(N);
13268 EVT SignVT = SignOp.getValueType();
13269
13270 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13271 // lower half with a copy.
13272 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13273 EVT MagVT = MagnitudeOp.getValueType();
13274
13275 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13276
13277 if (MagVT.getScalarType() == MVT::f64) {
13278 EVT F32VT = MagVT.isVector()
13279 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13280 : MVT::v2f32;
13281
13282 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
13283
13284     SmallVector<SDValue, 8> NewElts;
13285     for (unsigned I = 0; I != NumElts; ++I) {
13286 SDValue MagLo =
13287 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13288 DAG.getConstant(2 * I, DL, MVT::i32));
13289 SDValue MagHi =
13290 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
13291 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13292
13293 SDValue SignOpElt =
13294 MagVT.isVector()
13295              ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
13296                             SignOp, DAG.getConstant(I, DL, MVT::i32))
13297 : SignOp;
13298
13299 SDValue HiOp =
13300 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
13301
13302 SDValue Vector =
13303 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
13304
13305 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
13306 NewElts.push_back(NewElt);
13307 }
13308
13309 if (NewElts.size() == 1)
13310 return NewElts[0];
13311
13312 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
13313 }
13314
13315 if (SignVT.getScalarType() != MVT::f64)
13316 return SDValue();
13317
13318 // Reduce width of sign operand, we only need the highest bit.
13319 //
13320 // fcopysign f64:x, f64:y ->
13321 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13322 // TODO: In some cases it might make sense to go all the way to f16.
13323
13324 EVT F32VT = MagVT.isVector()
13325 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
13326 : MVT::v2f32;
13327
13328 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
13329
13330 SmallVector<SDValue, 8> F32Signs;
13331 for (unsigned I = 0; I != NumElts; ++I) {
13332 // Take sign from odd elements of cast vector
13333 SDValue SignAsF32 =
13334 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
13335 DAG.getConstant(2 * I + 1, DL, MVT::i32));
13336 F32Signs.push_back(SignAsF32);
13337 }
13338
13339 SDValue NewSign =
13340 NumElts == 1
13341 ? F32Signs.back()
13342           : DAG.getNode(ISD::BUILD_VECTOR, DL,
13343                 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
13344 F32Signs);
13345
13346 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
13347 NewSign);
13348}
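// Editorial worked example of the f64 split above: fcopysign(f64 1.0, -2.0).
//   1.0  = 0x3FF0000000000000 -> lo dword 0x00000000, hi dword 0x3FF00000
//   -2.0 = 0xC000000000000000 -> sign taken from its hi dword 0xC0000000
//   hi' = fcopysign(f32 bitcast 0x3FF00000, f32 bitcast 0xC0000000)
//       = 0xBFF00000
// Recombining {0x00000000, 0xBFF00000} gives 0xBFF0000000000000 == -1.0, so
// only the high 32-bit half ever needs the sign copy.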
13349
13350// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13351// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13352// bits
13353
13354// This is a variant of
13355// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13356//
13357// The normal DAG combiner will do this, but only if the add has one use since
13358// that would increase the number of instructions.
13359//
13360// This prevents us from seeing a constant offset that can be folded into a
13361// memory instruction's addressing mode. If we know the resulting add offset of
13362// a pointer can be folded into an addressing offset, we can replace the pointer
13363// operand with the add of new constant offset. This eliminates one of the uses,
13364// and may allow the remaining use to also be simplified.
13365//
13366SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13367 EVT MemVT,
13368 DAGCombinerInfo &DCI) const {
13369 SDValue N0 = N->getOperand(0);
13370 SDValue N1 = N->getOperand(1);
13371
13372  // We only do this for the multiple-use case, where it can be profitable;
13373  // with a single use, defer to the standard combine.
13374 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13375 return SDValue();
13376
13377 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
13378 if (!CN1)
13379 return SDValue();
13380
13381 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
13382 if (!CAdd)
13383 return SDValue();
13384
13385 SelectionDAG &DAG = DCI.DAG;
13386
13387 if (N0->getOpcode() == ISD::OR &&
13388 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
13389 return SDValue();
13390
13391 // If the resulting offset is too large, we can't fold it into the
13392 // addressing mode offset.
13393 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13394 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
13395
13396 AddrMode AM;
13397 AM.HasBaseReg = true;
13398 AM.BaseOffs = Offset.getSExtValue();
13399 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
13400 return SDValue();
13401
13402 SDLoc SL(N);
13403 EVT VT = N->getValueType(0);
13404
13405 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
13406 SDValue COffset = DAG.getConstant(Offset, SL, VT);
13407
13408 SDNodeFlags Flags;
13409 Flags.setNoUnsignedWrap(
13410 N->getFlags().hasNoUnsignedWrap() &&
13411 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13412
13413 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13414 // be sure that the new left operand is a proper base pointer.
13415 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
13416}
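// Editorial example of the distribution used above: shifting an add by a
// constant is the same as adding the shifted constant afterwards, which lets
// the (shl c1, c2) term be folded into an addressing-mode offset.
static_assert(((10 + 4) << 2) == ((10 << 2) + (4 << 2)),
              "(shl (add x, c1), c2) == (add (shl x, c2), (shl c1, c2))");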
13417
13418/// MemSDNode::getBasePtr() does not work for intrinsics, which need to be
13419/// offset past the chain and intrinsic ID. Theoretically we would also need to
13420/// check the specific intrinsic, but they all place the pointer operand first.
13421static unsigned getBasePtrIndex(const MemSDNode *N) {
13422 switch (N->getOpcode()) {
13423 case ISD::STORE:
13424  case ISD::INTRINSIC_W_CHAIN:
13425  case ISD::INTRINSIC_VOID:
13426    return 2;
13427 default:
13428 return 1;
13429 }
13430}
13431
13432SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13433 DAGCombinerInfo &DCI) const {
13434 SelectionDAG &DAG = DCI.DAG;
13435
13436 unsigned PtrIdx = getBasePtrIndex(N);
13437 SDValue Ptr = N->getOperand(PtrIdx);
13438
13439 // TODO: We could also do this for multiplies.
13440 if (Ptr.getOpcode() == ISD::SHL) {
13441 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13442 N->getMemoryVT(), DCI);
13443 if (NewPtr) {
13444 SmallVector<SDValue, 8> NewOps(N->ops());
13445
13446 NewOps[PtrIdx] = NewPtr;
13447 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13448 }
13449 }
13450
13451 return SDValue();
13452}
13453
13454static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13455 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13456 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13457 (Opc == ISD::XOR && Val == 0);
13458}
13459
13460// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13461// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13462// integer combine opportunities since most 64-bit operations are decomposed
13463// this way. TODO: We won't want this for SALU especially if it is an inline
13464// immediate.
13465SDValue SITargetLowering::splitBinaryBitConstantOp(
13466 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13467 const ConstantSDNode *CRHS) const {
13468 uint64_t Val = CRHS->getZExtValue();
13469 uint32_t ValLo = Lo_32(Val);
13470 uint32_t ValHi = Hi_32(Val);
13471 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13472
13473 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13474       bitOpWithConstantIsReducible(Opc, ValHi)) ||
13475      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13476 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13477 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13478 !CRHS->user_begin()->isDivergent())
13479 return SDValue();
13480
13481 // If we need to materialize a 64-bit immediate, it will be split up later
13482 // anyway. Avoid creating the harder to understand 64-bit immediate
13483 // materialization.
13484 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13485 }
13486
13487 return SDValue();
13488}
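// Editorial example: a divergent (and i64 %x, 0x00000000ffff0000) splits into
// (and i32 lo(%x), 0xffff0000) for the low dword and an AND with 0 for the
// high dword; the high half folds to 0 and the low half becomes a 32-bit op
// that later combines can see.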
13489
13490bool llvm::isBoolSGPR(SDValue V) {
13491  if (V.getValueType() != MVT::i1)
13492 return false;
13493 switch (V.getOpcode()) {
13494 default:
13495 break;
13496 case ISD::SETCC:
13497 case ISD::IS_FPCLASS:
13498 case AMDGPUISD::FP_CLASS:
13499 return true;
13500 case ISD::AND:
13501 case ISD::OR:
13502 case ISD::XOR:
13503 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13504 case ISD::SADDO:
13505 case ISD::UADDO:
13506 case ISD::SSUBO:
13507 case ISD::USUBO:
13508 case ISD::SMULO:
13509 case ISD::UMULO:
13510 return V.getResNo() == 1;
13511  case ISD::INTRINSIC_WO_CHAIN: {
13512    unsigned IntrinsicID = V.getConstantOperandVal(0);
13513 switch (IntrinsicID) {
13514 case Intrinsic::amdgcn_is_shared:
13515 case Intrinsic::amdgcn_is_private:
13516 return true;
13517 default:
13518 return false;
13519 }
13520
13521 return false;
13522 }
13523 }
13524 return false;
13525}
13526
13527// If a constant has all zeroes or all ones within each byte return it.
13528// Otherwise return 0.
13529static uint32_t getConstantPermuteMask(uint32_t C) {
13530  // 0xff for any zero byte in the mask
13531 uint32_t ZeroByteMask = 0;
13532 if (!(C & 0x000000ff))
13533 ZeroByteMask |= 0x000000ff;
13534 if (!(C & 0x0000ff00))
13535 ZeroByteMask |= 0x0000ff00;
13536 if (!(C & 0x00ff0000))
13537 ZeroByteMask |= 0x00ff0000;
13538 if (!(C & 0xff000000))
13539 ZeroByteMask |= 0xff000000;
13540 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13541 if ((NonZeroByteMask & C) != NonZeroByteMask)
13542 return 0; // Partial bytes selected.
13543 return C;
13544}
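// Editorial examples for getConstantPermuteMask:
//   0x00ff00ff -> 0x00ff00ff : every byte is either all-zero or all-one
//   0x00f000ff -> 0          : byte 2 (0xf0) selects only part of a byte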
13545
13546// Check if a node selects whole bytes from its operand 0 starting at a byte
13547// boundary while masking the rest. Returns the select mask as used by
13548// v_perm_b32, or -1 if unsuccessful.
13549// Note byte select encoding:
13550// value 0-3 selects corresponding source byte;
13551// value 0xc selects zero;
13552// value 0xff selects 0xff.
13553static uint32_t getPermuteMask(SDValue V) {
13554  assert(V.getValueSizeInBits() == 32);
13555
13556 if (V.getNumOperands() != 2)
13557 return ~0;
13558
13559 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13560 if (!N1)
13561 return ~0;
13562
13563 uint32_t C = N1->getZExtValue();
13564
13565 switch (V.getOpcode()) {
13566 default:
13567 break;
13568 case ISD::AND:
13569 if (uint32_t ConstMask = getConstantPermuteMask(C))
13570 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13571 break;
13572
13573 case ISD::OR:
13574 if (uint32_t ConstMask = getConstantPermuteMask(C))
13575 return (0x03020100 & ~ConstMask) | ConstMask;
13576 break;
13577
13578 case ISD::SHL:
13579 if (C % 8)
13580 return ~0;
13581
13582 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13583
13584 case ISD::SRL:
13585 if (C % 8)
13586 return ~0;
13587
13588 return uint32_t(0x0c0c0c0c03020100ull >> C);
13589 }
13590
13591 return ~0;
13592}
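// Editorial examples of the encoding produced above, checked at compile time
// against the shift tables used for ISD::SHL and ISD::SRL:
//   (and x, 0x0000ffff) -> 0x0c0c0100 : keep bytes 1:0, zero bytes 3:2
//   (shl x, 16)         -> 0x01000c0c : bytes 3:2 take src bytes 1:0
//   (srl x, 16)         -> 0x0c0c0302 : bytes 1:0 take src bytes 3:2
static_assert(uint32_t((0x030201000c0c0c0cull << 16) >> 32) == 0x01000c0cu,
              "shl-by-16 permute mask");
static_assert(uint32_t(0x0c0c0c0c03020100ull >> 16) == 0x0c0c0302u,
              "srl-by-16 permute mask");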
13593
13594SDValue SITargetLowering::performAndCombine(SDNode *N,
13595 DAGCombinerInfo &DCI) const {
13596 if (DCI.isBeforeLegalize())
13597 return SDValue();
13598
13599 SelectionDAG &DAG = DCI.DAG;
13600 EVT VT = N->getValueType(0);
13601 SDValue LHS = N->getOperand(0);
13602 SDValue RHS = N->getOperand(1);
13603
13604 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13605 if (VT == MVT::i64 && CRHS) {
13606 if (SDValue Split =
13607 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13608 return Split;
13609 }
13610
13611 if (CRHS && VT == MVT::i32) {
13612 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13613 // nb = number of trailing zeroes in mask
13614 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13615 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
13616 uint64_t Mask = CRHS->getZExtValue();
13617 unsigned Bits = llvm::popcount(Mask);
13618 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13619 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13620 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13621 unsigned Shift = CShift->getZExtValue();
13622 unsigned NB = CRHS->getAPIntValue().countr_zero();
13623 unsigned Offset = NB + Shift;
13624 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13625 SDLoc SL(N);
13626 SDValue BFE =
13627 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13628 DAG.getConstant(Offset, SL, MVT::i32),
13629 DAG.getConstant(Bits, SL, MVT::i32));
13630 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13631 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13632 DAG.getValueType(NarrowVT));
13633 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13634 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13635 return Shl;
13636 }
13637 }
13638 }
13639
13640 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13641 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13642 isa<ConstantSDNode>(LHS.getOperand(2))) {
13643 uint32_t Sel = getConstantPermuteMask(Mask);
13644 if (!Sel)
13645 return SDValue();
13646
13647 // Select 0xc for all zero bytes
13648 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13649 SDLoc DL(N);
13650 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13651 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13652 }
13653 }
13654
13655 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13656 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13657 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13658 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13659 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13660
13661 SDValue X = LHS.getOperand(0);
13662 SDValue Y = RHS.getOperand(0);
13663 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13664 !isTypeLegal(X.getValueType()))
13665 return SDValue();
13666
13667 if (LCC == ISD::SETO) {
13668 if (X != LHS.getOperand(1))
13669 return SDValue();
13670
13671 if (RCC == ISD::SETUNE) {
13672 const ConstantFPSDNode *C1 =
13673 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13674 if (!C1 || !C1->isInfinity() || C1->isNegative())
13675 return SDValue();
13676
13677 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13678                               SIInstrFlags::S_NORMAL | SIInstrFlags::P_NORMAL |
13679                               SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO |
13680                               SIInstrFlags::N_SUBNORMAL | SIInstrFlags::P_SUBNORMAL;
13681
13682         static_assert(
13683             ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13684                 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13685              0x3ff) == Mask,
13686 "mask not equal");
13687
13688 SDLoc DL(N);
13689 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13690 DAG.getConstant(Mask, DL, MVT::i32));
13691 }
13692 }
13693 }
13694
13695 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13696 std::swap(LHS, RHS);
13697
13698 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13699 RHS.hasOneUse()) {
13700 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13701 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
13702 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13703 // | n_nan)
13704 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13705 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13706 (RHS.getOperand(0) == LHS.getOperand(0) &&
13707 LHS.getOperand(0) == LHS.getOperand(1))) {
13708 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13709 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13710 : Mask->getZExtValue() & OrdMask;
13711
13712 SDLoc DL(N);
13713 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13714 DAG.getConstant(NewMask, DL, MVT::i32));
13715 }
13716 }
13717
13718 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13719 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13720 // and x, (sext cc from i1) => select cc, x, 0
13721 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13722 std::swap(LHS, RHS);
13723 if (isBoolSGPR(RHS.getOperand(0)))
13724 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13725 DAG.getConstant(0, SDLoc(N), MVT::i32));
13726 }
13727
13728 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13729 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13730 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13731 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13732 uint32_t LHSMask = getPermuteMask(LHS);
13733 uint32_t RHSMask = getPermuteMask(RHS);
13734 if (LHSMask != ~0u && RHSMask != ~0u) {
13735 // Canonicalize the expression in an attempt to have fewer unique masks
13736 // and therefore fewer registers used to hold the masks.
13737 if (LHSMask > RHSMask) {
13738 std::swap(LHSMask, RHSMask);
13739 std::swap(LHS, RHS);
13740 }
13741
13742 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13743 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13744 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13745 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13746
13747       // Check if we need to combine values from two sources within a byte.
13748 if (!(LHSUsedLanes & RHSUsedLanes) &&
13749 // If we select high and lower word keep it for SDWA.
13750 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13751 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13752 // Each byte in each mask is either selector mask 0-3, or has higher
13753 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13754 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13755 // mask which is not 0xff wins. By anding both masks we have a correct
13756 // result except that 0x0c shall be corrected to give 0x0c only.
13757 uint32_t Mask = LHSMask & RHSMask;
13758 for (unsigned I = 0; I < 32; I += 8) {
13759 uint32_t ByteSel = 0xff << I;
13760 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13761 Mask &= (0x0c << I) & 0xffffffff;
13762 }
13763
13764 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13765 // or 0x0c.
13766 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13767 SDLoc DL(N);
13768
13769 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13770 RHS.getOperand(0),
13771 DAG.getConstant(Sel, DL, MVT::i32));
13772 }
13773 }
13774 }
13775
13776 return SDValue();
13777}
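// Editorial example of the and-of-srl rewrite in performAndCombine: for
// x = 0x12345678, ((x >> 8) & 0xff00) extracts bits [23:16] and re-shifts
// them up by 8, which is exactly (bfe_u32 x, 16, 8) << 8.
static_assert((((0x12345678u >> 8) & 0xff00u) ==
               (((0x12345678u >> 16) & 0xffu) << 8)),
              "and-of-srl is equivalent to bfe + shl");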
13778
13779// A key component of v_perm is a mapping between byte position of the src
13780// operands, and the byte position of the dest. To provide such, we need: 1. the
13781// node that provides x byte of the dest of the OR, and 2. the byte of the node
13782// used to provide that x byte. calculateByteProvider finds which node provides
13783// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13784// and finds an ultimate src and byte position For example: The supported
13785// and finds an ultimate src and byte position. For example, the supported
13786// t1
13787// or
13788// / \
13789// t2 t3
13790// zext shl
13791// | | \
13792// t4 t5 16
13793// or anyext
13794// / \ |
13795// t6 t7 t8
13796// srl shl or
13797// / | / \ / \
13798// t9 t10 t11 t12 t13 t14
13799// trunc* 8 trunc* 8 and and
13800// | | / | | \
13801// t15 t16 t17 t18 t19 t20
13802// trunc* 255 srl -256
13803// | / \
13804// t15 t15 16
13805//
13806// *In this example, the truncs are from i32->i16
13807//
13808// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13809// respectively. calculateSrcByte would find (given node) -> ultimate src &
13810// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13811// After finding the mapping, we can combine the tree into vperm t15, t16,
13812// 0x05000407
13813
13814// Find the source and byte position from a node.
13815// \p DestByte is the byte position of the dest of the or that the src
13816// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13817// dest of the or byte. \p Depth tracks how many recursive iterations we have
13818// performed.
13819static const std::optional<ByteProvider<SDValue>>
13820calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13821 unsigned Depth = 0) {
13822 // We may need to recursively traverse a series of SRLs
13823 if (Depth >= 6)
13824 return std::nullopt;
13825
13826 if (Op.getValueSizeInBits() < 8)
13827 return std::nullopt;
13828
13829 if (Op.getValueType().isVector())
13830 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13831
13832 switch (Op->getOpcode()) {
13833 case ISD::TRUNCATE: {
13834 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13835 }
13836
13837 case ISD::ANY_EXTEND:
13838 case ISD::SIGN_EXTEND:
13839 case ISD::ZERO_EXTEND:
13840  case ISD::SIGN_EXTEND_INREG: {
13841    SDValue NarrowOp = Op->getOperand(0);
13842 auto NarrowVT = NarrowOp.getValueType();
13843 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13844 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13845 NarrowVT = VTSign->getVT();
13846 }
13847 if (!NarrowVT.isByteSized())
13848 return std::nullopt;
13849 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13850
13851 if (SrcIndex >= NarrowByteWidth)
13852 return std::nullopt;
13853 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13854 }
13855
13856 case ISD::SRA:
13857 case ISD::SRL: {
13858 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13859 if (!ShiftOp)
13860 return std::nullopt;
13861
13862 uint64_t BitShift = ShiftOp->getZExtValue();
13863
13864 if (BitShift % 8 != 0)
13865 return std::nullopt;
13866
13867 SrcIndex += BitShift / 8;
13868
13869 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13870 }
13871
13872 default: {
13873 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13874 }
13875 }
13876 llvm_unreachable("fully handled switch");
13877}
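// Editorial example: asked for dest byte 0 of (srl t, 16), calculateSrcByte
// steps through the shift (16 bits == 2 bytes) and reports that t supplies the
// byte from its own byte 2, i.e. {Src = t, SrcIndex = 2}.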
13878
13879// For a byte position in the result of an Or, traverse the tree and find the
13880// node (and the byte of the node) which ultimately provides this {Or,
13881// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13882// the byte position of the Op that corresponds with the originally requested
13883// byte of the Or \p Depth tracks how many recursive iterations we have
13884// performed. \p StartingIndex is the originally requested byte of the Or
13885static const std::optional<ByteProvider<SDValue>>
13886calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13887 unsigned StartingIndex = 0) {
13888 // Finding Src tree of RHS of or typically requires at least 1 additional
13889 // depth
13890 if (Depth > 6)
13891 return std::nullopt;
13892
13893 unsigned BitWidth = Op.getScalarValueSizeInBits();
13894 if (BitWidth % 8 != 0)
13895 return std::nullopt;
13896 if (Index > BitWidth / 8 - 1)
13897 return std::nullopt;
13898
13899 bool IsVec = Op.getValueType().isVector();
13900 switch (Op.getOpcode()) {
13901 case ISD::OR: {
13902 if (IsVec)
13903 return std::nullopt;
13904
13905 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13906 StartingIndex);
13907 if (!RHS)
13908 return std::nullopt;
13909 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13910 StartingIndex);
13911 if (!LHS)
13912 return std::nullopt;
13913 // A well formed Or will have two ByteProviders for each byte, one of which
13914 // is constant zero
13915 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13916 return std::nullopt;
13917 if (!LHS || LHS->isConstantZero())
13918 return RHS;
13919 if (!RHS || RHS->isConstantZero())
13920 return LHS;
13921 return std::nullopt;
13922 }
13923
13924 case ISD::AND: {
13925 if (IsVec)
13926 return std::nullopt;
13927
13928 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13929 if (!BitMaskOp)
13930 return std::nullopt;
13931
13932 uint32_t BitMask = BitMaskOp->getZExtValue();
13933 // Bits we expect for our StartingIndex
13934 uint32_t IndexMask = 0xFF << (Index * 8);
13935
13936 if ((IndexMask & BitMask) != IndexMask) {
13937 // If the result of the and partially provides the byte, then it
13938 // is not well formatted
13939 if (IndexMask & BitMask)
13940 return std::nullopt;
13941      return ByteProvider<SDValue>::getConstantZero();
13942    }
13943
13944 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13945 }
13946
13947 case ISD::FSHR: {
13948 if (IsVec)
13949 return std::nullopt;
13950
13951 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13952 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13953 if (!ShiftOp || Op.getValueType().isVector())
13954 return std::nullopt;
13955
13956 uint64_t BitsProvided = Op.getValueSizeInBits();
13957 if (BitsProvided % 8 != 0)
13958 return std::nullopt;
13959
13960 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13961 if (BitShift % 8)
13962 return std::nullopt;
13963
13964 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13965 uint64_t ByteShift = BitShift / 8;
13966
13967 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13968 uint64_t BytesProvided = BitsProvided / 8;
13969 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13970 NewIndex %= BytesProvided;
13971 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13972 }
13973
13974 case ISD::SRA:
13975 case ISD::SRL: {
13976 if (IsVec)
13977 return std::nullopt;
13978
13979 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13980 if (!ShiftOp)
13981 return std::nullopt;
13982
13983 uint64_t BitShift = ShiftOp->getZExtValue();
13984 if (BitShift % 8)
13985 return std::nullopt;
13986
13987 auto BitsProvided = Op.getScalarValueSizeInBits();
13988 if (BitsProvided % 8 != 0)
13989 return std::nullopt;
13990
13991 uint64_t BytesProvided = BitsProvided / 8;
13992 uint64_t ByteShift = BitShift / 8;
13993 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13994 // If the byte we are trying to provide (as tracked by index) falls in this
13995 // range, then the SRL provides the byte. The byte of interest of the src of
13996 // the SRL is Index + ByteShift
13997 return BytesProvided - ByteShift > Index
13998 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13999 Index + ByteShift)
14000                : ByteProvider<SDValue>::getConstantZero();
14001  }
14002
14003 case ISD::SHL: {
14004 if (IsVec)
14005 return std::nullopt;
14006
14007 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14008 if (!ShiftOp)
14009 return std::nullopt;
14010
14011 uint64_t BitShift = ShiftOp->getZExtValue();
14012 if (BitShift % 8 != 0)
14013 return std::nullopt;
14014 uint64_t ByteShift = BitShift / 8;
14015
14016 // If we are shifting by an amount greater than (or equal to)
14017 // the index we are trying to provide, then it provides 0s. If not,
14018     // then these bytes are not definitively 0s, and the corresponding byte
14019 // of interest is Index - ByteShift of the src
14020 return Index < ByteShift
14021                ? ByteProvider<SDValue>::getConstantZero()
14022                : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
14023 Depth + 1, StartingIndex);
14024 }
14025 case ISD::ANY_EXTEND:
14026 case ISD::SIGN_EXTEND:
14027 case ISD::ZERO_EXTEND:
14028  case ISD::SIGN_EXTEND_INREG:
14029  case ISD::AssertZext:
14030 case ISD::AssertSext: {
14031 if (IsVec)
14032 return std::nullopt;
14033
14034 SDValue NarrowOp = Op->getOperand(0);
14035 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14036 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14037 Op->getOpcode() == ISD::AssertZext ||
14038 Op->getOpcode() == ISD::AssertSext) {
14039 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14040 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14041 }
14042 if (NarrowBitWidth % 8 != 0)
14043 return std::nullopt;
14044 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14045
14046 if (Index >= NarrowByteWidth)
14047 return Op.getOpcode() == ISD::ZERO_EXTEND
14048 ? std::optional<ByteProvider<SDValue>>(
14049                      ByteProvider<SDValue>::getConstantZero())
14050                : std::nullopt;
14051 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
14052 }
14053
14054 case ISD::TRUNCATE: {
14055 if (IsVec)
14056 return std::nullopt;
14057
14058 uint64_t NarrowByteWidth = BitWidth / 8;
14059
14060 if (NarrowByteWidth >= Index) {
14061 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14062 StartingIndex);
14063 }
14064
14065 return std::nullopt;
14066 }
14067
14068 case ISD::CopyFromReg: {
14069 if (BitWidth / 8 > Index)
14070 return calculateSrcByte(Op, StartingIndex, Index);
14071
14072 return std::nullopt;
14073 }
14074
14075 case ISD::LOAD: {
14076 auto *L = cast<LoadSDNode>(Op.getNode());
14077
14078 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14079 if (NarrowBitWidth % 8 != 0)
14080 return std::nullopt;
14081 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14082
14083    // If the width of the load does not reach the byte we are trying to provide
14084    // for and it is not a ZEXTLOAD, then the load does not provide for the byte
14085    // in question
14086 if (Index >= NarrowByteWidth) {
14087 return L->getExtensionType() == ISD::ZEXTLOAD
14088 ? std::optional<ByteProvider<SDValue>>(
14089                        ByteProvider<SDValue>::getConstantZero())
14090                  : std::nullopt;
14091 }
14092
14093 if (NarrowByteWidth > Index) {
14094 return calculateSrcByte(Op, StartingIndex, Index);
14095 }
14096
14097 return std::nullopt;
14098 }
14099
14100 case ISD::BSWAP: {
14101 if (IsVec)
14102 return std::nullopt;
14103
14104 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
14105 Depth + 1, StartingIndex);
14106 }
14107
14108  case ISD::EXTRACT_VECTOR_ELT: {
14109    auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14110 if (!IdxOp)
14111 return std::nullopt;
14112 auto VecIdx = IdxOp->getZExtValue();
14113 auto ScalarSize = Op.getScalarValueSizeInBits();
14114 if (ScalarSize < 32)
14115 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14116 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
14117 StartingIndex, Index);
14118 }
14119
14120 case AMDGPUISD::PERM: {
14121 if (IsVec)
14122 return std::nullopt;
14123
14124 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14125 if (!PermMask)
14126 return std::nullopt;
14127
14128 auto IdxMask =
14129 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14130 if (IdxMask > 0x07 && IdxMask != 0x0c)
14131 return std::nullopt;
14132
14133 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14134 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14135
14136 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
14137                           : ByteProvider<SDValue>(
14138                                 ByteProvider<SDValue>::getConstantZero());
14139  }
14140
14141 default: {
14142 return std::nullopt;
14143 }
14144 }
14145
14146 llvm_unreachable("fully handled switch");
14147}
14148
14149// Returns true if the Operand is a scalar and is 16 bits
14150static bool isExtendedFrom16Bits(SDValue &Operand) {
14151
14152 switch (Operand.getOpcode()) {
14153 case ISD::ANY_EXTEND:
14154 case ISD::SIGN_EXTEND:
14155 case ISD::ZERO_EXTEND: {
14156 auto OpVT = Operand.getOperand(0).getValueType();
14157 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14158 }
14159 case ISD::LOAD: {
14160 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
14161 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
14162 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14163 ExtType == ISD::EXTLOAD) {
14164 auto MemVT = L->getMemoryVT();
14165 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14166 }
14167 return L->getMemoryVT().getSizeInBits() == 16;
14168 }
14169 default:
14170 return false;
14171 }
14172}
14173
14174// Returns true if the mask matches consecutive bytes and the first byte
14175// begins at an even byte offset from the 0th byte.
14176static bool addresses16Bits(int Mask) {
14177 int Low8 = Mask & 0xff;
14178 int Hi8 = (Mask & 0xff00) >> 8;
14179
14180 assert(Low8 < 8 && Hi8 < 8);
14181 // Are the bytes contiguous in the order of increasing addresses.
14182 bool IsConsecutive = (Hi8 - Low8 == 1);
14183 // Is the first byte at location that is aligned for 16 bit instructions.
14184 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
14185 // In this case, we still need code to extract the 16 bit operand, so it
14186 // is better to use i8 v_perm
14187 bool Is16Aligned = !(Low8 % 2);
14188
14189 return IsConsecutive && Is16Aligned;
14190}
14191
14192// Do not lower into v_perm if the operands are actually 16 bit
14193// and the selected bits (based on PermMask) correspond with two
14194// easily addressable 16 bit operands.
14195static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14196                                 SDValue &OtherOp) {
14197 int Low16 = PermMask & 0xffff;
14198 int Hi16 = (PermMask & 0xffff0000) >> 16;
14199
14200 auto TempOp = peekThroughBitcasts(Op);
14201 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14202
14203 auto OpIs16Bit =
14204 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14205 if (!OpIs16Bit)
14206 return true;
14207
14208 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14209 isExtendedFrom16Bits(TempOtherOp);
14210 if (!OtherOpIs16Bit)
14211 return true;
14212
14213 // Do we cleanly address both
14214 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14215}
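// Editorial example: with both operands (extended from) 16 bits, PermMask
// 0x07060100 selects src bytes {1,0} for the low half and {7,6} for the high
// half; both halves are consecutive and start at even offsets, so the function
// returns false and v_perm is not used. PermMask 0x06050100 starts the high
// half at odd byte 5, so it returns true and v_perm is used.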
14216
14217static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14218                                  unsigned DWordOffset) {
14219 SDValue Ret;
14220
14221 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14222 // ByteProvider must be at least 8 bits
14223 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14224
14225 if (TypeSize <= 32)
14226 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14227
14228 if (Src.getValueType().isVector()) {
14229 auto ScalarTySize = Src.getScalarValueSizeInBits();
14230 auto ScalarTy = Src.getValueType().getScalarType();
14231 if (ScalarTySize == 32) {
14232 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14233 DAG.getConstant(DWordOffset, SL, MVT::i32));
14234 }
14235 if (ScalarTySize > 32) {
14236 Ret = DAG.getNode(
14237 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14238 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14239 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14240 if (ShiftVal)
14241 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14242 DAG.getConstant(ShiftVal, SL, MVT::i32));
14243 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14244 }
14245
14246 assert(ScalarTySize < 32);
14247 auto NumElements = TypeSize / ScalarTySize;
14248 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14249 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14250 auto NumElementsIn32 = 32 / ScalarTySize;
14251 auto NumAvailElements = DWordOffset < Trunc32Elements
14252 ? NumElementsIn32
14253 : NumElements - NormalizedTrunc;
14254
14256 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14257 NumAvailElements);
14258
14259 Ret = DAG.getBuildVector(
14260 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14261 VecSrcs);
14262 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14263 }
14264
14265 /// Scalar Type
14266 auto ShiftVal = 32 * DWordOffset;
14267 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14268 DAG.getConstant(ShiftVal, SL, MVT::i32));
14269 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14270}
14271
14272static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14273   SelectionDAG &DAG = DCI.DAG;
14274 [[maybe_unused]] EVT VT = N->getValueType(0);
14275   SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14276
14277 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14278 assert(VT == MVT::i32);
14279 for (int i = 0; i < 4; i++) {
14280 // Find the ByteProvider that provides the ith byte of the result of OR
14281 std::optional<ByteProvider<SDValue>> P =
14282 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
14283 // TODO support constantZero
14284 if (!P || P->isConstantZero())
14285 return SDValue();
14286
14287 PermNodes.push_back(*P);
14288 }
14289 if (PermNodes.size() != 4)
14290 return SDValue();
14291
14292 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14293 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14294 uint64_t PermMask = 0x00000000;
14295 for (size_t i = 0; i < PermNodes.size(); i++) {
14296 auto PermOp = PermNodes[i];
14297 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14298 // by sizeof(Src2) = 4
14299 int SrcByteAdjust = 4;
14300
14301    // If the Src uses a byte from a different DWORD, then it corresponds
14302    // with a different source
14303 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14304 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14305 if (SecondSrc)
14306 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14307 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14308 return SDValue();
14309
14310 // Set the index of the second distinct Src node
14311 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14312 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14313 SrcByteAdjust = 0;
14314 }
14315 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14316     assert(!DAG.getDataLayout().isBigEndian());
14317     PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14318 }
14319 SDLoc DL(N);
14320 SDValue Op = *PermNodes[FirstSrc.first].Src;
14321 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
14322 assert(Op.getValueSizeInBits() == 32);
14323
14324 // Check that we are not just extracting the bytes in order from an op
14325 if (!SecondSrc) {
14326 int Low16 = PermMask & 0xffff;
14327 int Hi16 = (PermMask & 0xffff0000) >> 16;
14328
14329 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14330 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14331
14332 // The perm op would really just produce Op. So combine into Op
14333 if (WellFormedLow && WellFormedHi)
14334 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
14335 }
14336
14337 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14338
14339 if (SecondSrc) {
14340 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
14341 assert(OtherOp.getValueSizeInBits() == 32);
14342 }
14343
14344 // Check that we haven't just recreated the same FSHR node.
14345 if (N->getOpcode() == ISD::FSHR &&
14346 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14347 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14348 return SDValue();
14349
14350 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14351
14352 assert(Op.getValueType().isByteSized() &&
14353 OtherOp.getValueType().isByteSized());
14354
14355 // If the ultimate src is less than 32 bits, then we will only be
14356 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14357 // CalculateByteProvider would not have returned Op as source if we
14358 // used a byte that is outside its ValueType. Thus, we are free to
14359 // ANY_EXTEND as the extended bits are dont-cares.
14360 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
14361 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
14362
14363 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14364 DAG.getConstant(PermMask, DL, MVT::i32));
14365 }
14366 return SDValue();
14367}
14368
14369SDValue SITargetLowering::performOrCombine(SDNode *N,
14370 DAGCombinerInfo &DCI) const {
14371 SelectionDAG &DAG = DCI.DAG;
14372 SDValue LHS = N->getOperand(0);
14373 SDValue RHS = N->getOperand(1);
14374
14375 EVT VT = N->getValueType(0);
14376 if (VT == MVT::i1) {
14377 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14378 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14379 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14380 SDValue Src = LHS.getOperand(0);
14381 if (Src != RHS.getOperand(0))
14382 return SDValue();
14383
14384 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
14385 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14386 if (!CLHS || !CRHS)
14387 return SDValue();
14388
14389 // Only 10 bits are used.
14390 static const uint32_t MaxMask = 0x3ff;
14391
14392 uint32_t NewMask =
14393 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14394 SDLoc DL(N);
14395 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14396 DAG.getConstant(NewMask, DL, MVT::i32));
14397 }
14398
14399 return SDValue();
14400 }
14401
14402 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14403  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
14404       LHS.getOpcode() == AMDGPUISD::PERM &&
14405 isa<ConstantSDNode>(LHS.getOperand(2))) {
14406 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
14407 if (!Sel)
14408 return SDValue();
14409
14410 Sel |= LHS.getConstantOperandVal(2);
14411 SDLoc DL(N);
14412 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14413 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14414 }
14415
14416 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14417 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14418 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14419 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14420
14421 // If all the uses of an or need to extract the individual elements, do not
14422 // attempt to lower into v_perm
14423 auto usesCombinedOperand = [](SDNode *OrUse) {
14424 // If we have any non-vectorized use, then it is a candidate for v_perm
14425 if (OrUse->getOpcode() != ISD::BITCAST ||
14426 !OrUse->getValueType(0).isVector())
14427 return true;
14428
14429 // If we have any non-vectorized use, then it is a candidate for v_perm
14430 for (auto *VUser : OrUse->users()) {
14431 if (!VUser->getValueType(0).isVector())
14432 return true;
14433
14434 // If the use of a vector is a store, then combining via a v_perm
14435 // is beneficial.
14436 // TODO -- whitelist more uses
14437 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14438 if (VUser->getOpcode() == VectorwiseOp)
14439 return true;
14440 }
14441 return false;
14442 };
14443
14444 if (!any_of(N->users(), usesCombinedOperand))
14445 return SDValue();
14446
14447 uint32_t LHSMask = getPermuteMask(LHS);
14448 uint32_t RHSMask = getPermuteMask(RHS);
14449
14450 if (LHSMask != ~0u && RHSMask != ~0u) {
14451 // Canonicalize the expression in an attempt to have fewer unique masks
14452 // and therefore fewer registers used to hold the masks.
14453 if (LHSMask > RHSMask) {
14454 std::swap(LHSMask, RHSMask);
14455 std::swap(LHS, RHS);
14456 }
14457
14458 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14459 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14460 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14461 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14462
14463 // Check if we need to combine values from two sources within a byte.
14464 if (!(LHSUsedLanes & RHSUsedLanes) &&
14465 // If we select the high and low words, keep it for SDWA.
14466 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14467 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14468 // Kill zero bytes selected by other mask. Zero value is 0xc.
14469 LHSMask &= ~RHSUsedLanes;
14470 RHSMask &= ~LHSUsedLanes;
14471 // Add 4 to each active LHS lane
14472 LHSMask |= LHSUsedLanes & 0x04040404;
14473 // Combine masks
14474 uint32_t Sel = LHSMask | RHSMask;
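// Worked example with illustrative masks: LHSMask = 0x0c0c0100 (LHS supplies
// result bytes 0-1) and RHSMask = 0x03020c0c (RHS supplies result bytes 2-3)
// give LHSUsedLanes = 0x00000c0c and RHSUsedLanes = 0x0c0c0000, and after the
// steps above Sel = 0x03020504: result bytes 0-1 take bytes 0-1 of the first
// v_perm operand (select codes 4-5) and result bytes 2-3 take bytes 2-3 of
// the second operand (select codes 2-3).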
14475 SDLoc DL(N);
14476
14477 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14478 RHS.getOperand(0),
14479 DAG.getConstant(Sel, DL, MVT::i32));
14480 }
14481 }
14482 if (LHSMask == ~0u || RHSMask == ~0u) {
14483 if (SDValue Perm = matchPERM(N, DCI))
14484 return Perm;
14485 }
14486 }
14487
14488 // Detect identity v2i32 OR and replace with identity source node.
14489 // Specifically an Or that has operands constructed from the same source node
14490 // via extract_vector_elt and build_vector, i.e.
14491 // v2i32 or(
14492 // v2i32 build_vector(
14493 // i32 extract_elt(%IdentitySrc, 0),
14494 // i32 0
14495 // ),
14496 // v2i32 build_vector(
14497 // i32 0,
14498 // i32 extract_elt(%IdentitySrc, 1)
14499 // ) )
14500 // =>
14501 // v2i32 %IdentitySrc
14502
14503 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14504 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14505
14506 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14507 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14508
14509 // Test for and normalise build vectors.
14510 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14511
14512 // Get the extract_vector_element operands.
14513 SDValue LEVE = LHS->getOperand(0);
14514 SDValue REVE = RHS->getOperand(1);
14515
14516 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14517 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14518 // Check that different elements from the same vector are
14519 // extracted.
14520 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14521 LEVE->getOperand(1) != REVE->getOperand(1)) {
14522 SDValue IdentitySrc = LEVE.getOperand(0);
14523 return IdentitySrc;
14524 }
14525 }
14526 }
14527 }
14528
14529 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14530 return SDValue();
14531
14532 // TODO: This could be a generic combine with a predicate for extracting the
14533 // high half of an integer being free.
14534
14535 // (or i64:x, (zero_extend i32:y)) ->
14536 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
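// Only the low 32 bits need a real OR here; the high half of x passes through
// unchanged, so the 64-bit OR shrinks to a single 32-bit OR plus a vector
// build and bitcast.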
14537 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14538 RHS.getOpcode() != ISD::ZERO_EXTEND)
14539 std::swap(LHS, RHS);
14540
14541 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14542 SDValue ExtSrc = RHS.getOperand(0);
14543 EVT SrcVT = ExtSrc.getValueType();
14544 if (SrcVT == MVT::i32) {
14545 SDLoc SL(N);
14546 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14547 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14548
14549 DCI.AddToWorklist(LowOr.getNode());
14550 DCI.AddToWorklist(HiBits.getNode());
14551
14552 SDValue Vec =
14553 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14554 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14555 }
14556 }
14557
14558 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14559 if (CRHS) {
14560 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14561 N->getOperand(0), CRHS))
14562 return Split;
14563 }
14564
14565 return SDValue();
14566}
14567
14568SDValue SITargetLowering::performXorCombine(SDNode *N,
14569 DAGCombinerInfo &DCI) const {
14570 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14571 return RV;
14572
14573 SDValue LHS = N->getOperand(0);
14574 SDValue RHS = N->getOperand(1);
14575
14576 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14577 SelectionDAG &DAG = DCI.DAG;
14578
14579 EVT VT = N->getValueType(0);
14580 if (CRHS && VT == MVT::i64) {
14581 if (SDValue Split =
14582 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14583 return Split;
14584 }
14585
14586 // v2i32 (xor (vselect cc, x, y), K) ->
14587 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14588 // replaced with source modifiers when the select is lowered to CNDMASK.
14589 unsigned Opc = LHS.getOpcode();
14590 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14591 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14592 CRHS && CRHS->getAPIntValue().isSignMask()) {
14593 SDValue CC = LHS->getOperand(0);
14594 SDValue TRUE = LHS->getOperand(1);
14595 SDValue FALSE = LHS->getOperand(2);
14596 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14597 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14598 SDValue XSelect =
14599 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14600 return XSelect;
14601 }
14602
14603 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14604 // fneg-like xors into 64-bit select.
14605 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14606 // This looks like an fneg, try to fold as a source modifier.
14607 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14608 shouldFoldFNegIntoSrc(N, LHS)) {
14609 // xor (select c, a, b), 0x80000000 ->
14610 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14611 SDLoc DL(N);
14612 SDValue CastLHS =
14613 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14614 SDValue CastRHS =
14615 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14616 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14617 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14618 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14619 LHS->getOperand(0), FNegLHS, FNegRHS);
14620 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14621 }
14622 }
14623
14624 return SDValue();
14625}
14626
14627SDValue
14628SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
14629 DAGCombinerInfo &DCI) const {
14630 if (!Subtarget->has16BitInsts() ||
14631 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
14632 return SDValue();
14633
14634 EVT VT = N->getValueType(0);
14635 if (VT != MVT::i32)
14636 return SDValue();
14637
14638 SDValue Src = N->getOperand(0);
14639 if (Src.getValueType() != MVT::i16)
14640 return SDValue();
14641
14642 if (!Src->hasOneUse())
14643 return SDValue();
14644
14645 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
14646 // possible we're missing out on some combine opportunities, but we'd need to
14647 // weigh the cost of extracting the byte from the upper dwords.
14648
14649 std::optional<ByteProvider<SDValue>> BP0 =
14650 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
14651 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14652 return SDValue();
14653 SDValue V0 = *BP0->Src;
14654
14655 std::optional<ByteProvider<SDValue>> BP1 =
14656 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
14657 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14658 return SDValue();
14659
14660 SDValue V1 = *BP1->Src;
14661
14662 if (V0 == V1)
14663 return SDValue();
14664
14665 SelectionDAG &DAG = DCI.DAG;
14666 SDLoc DL(N);
14667 uint32_t PermMask = 0x0c0c0c0c;
14668 if (V0) {
14669 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
14670 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14671 }
14672
14673 if (V1) {
14674 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
14675 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14676 }
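// At this point the mask selects byte BP0->SrcOffset of V0 (first v_perm
// operand, select codes 4-7) into result byte 0, byte BP1->SrcOffset of V1
// (second operand, select codes 0-3) into result byte 1, and zero-fills the
// upper two result bytes (0x0c). e.g. SrcOffsets 2 and 0 give 0x0c0c0006.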
14677
14678 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
14679 DAG.getConstant(PermMask, DL, MVT::i32));
14680}
14681
14682SDValue
14683SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14684 DAGCombinerInfo &DCI) const {
14685 SDValue Src = N->getOperand(0);
14686 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14687
14688 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14689 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14690 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14691 VTSign->getVT() == MVT::i8) ||
14692 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14693 VTSign->getVT() == MVT::i16))) {
14694 assert(Subtarget->hasScalarSubwordLoads() &&
14695 "s_buffer_load_{u8, i8} are supported "
14696 "in GFX12 (or newer) architectures.");
14697 EVT VT = Src.getValueType();
14698 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14699 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14700 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14701 SDLoc DL(N);
14702 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14703 SDValue Ops[] = {
14704 Src.getOperand(0), // source register
14705 Src.getOperand(1), // offset
14706 Src.getOperand(2) // cachePolicy
14707 };
14708 auto *M = cast<MemSDNode>(Src);
14709 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14710 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14711 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14712 return LoadVal;
14713 }
14714 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14715 VTSign->getVT() == MVT::i8) ||
14716 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14717 VTSign->getVT() == MVT::i16)) &&
14718 Src.hasOneUse()) {
14719 auto *M = cast<MemSDNode>(Src);
14720 SDValue Ops[] = {Src.getOperand(0), // Chain
14721 Src.getOperand(1), // rsrc
14722 Src.getOperand(2), // vindex
14723 Src.getOperand(3), // voffset
14724 Src.getOperand(4), // soffset
14725 Src.getOperand(5), // offset
14726 Src.getOperand(6), Src.getOperand(7)};
14727 // replace with BUFFER_LOAD_BYTE/SHORT
14728 SDVTList ResList =
14729 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14730 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14731 ? AMDGPUISD::BUFFER_LOAD_BYTE
14732 : AMDGPUISD::BUFFER_LOAD_SHORT;
14733 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14734 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14735 return DCI.DAG.getMergeValues(
14736 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14737 }
14738 return SDValue();
14739}
14740
14741SDValue SITargetLowering::performClassCombine(SDNode *N,
14742 DAGCombinerInfo &DCI) const {
14743 SelectionDAG &DAG = DCI.DAG;
14744 SDValue Mask = N->getOperand(1);
14745
14746 // fp_class x, 0 -> false
14747 if (isNullConstant(Mask))
14748 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14749
14750 if (N->getOperand(0).isUndef())
14751 return DAG.getUNDEF(MVT::i1);
14752
14753 return SDValue();
14754}
14755
14756SDValue SITargetLowering::performRcpCombine(SDNode *N,
14757 DAGCombinerInfo &DCI) const {
14758 EVT VT = N->getValueType(0);
14759 SDValue N0 = N->getOperand(0);
14760
14761 if (N0.isUndef()) {
14762 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14763 SDLoc(N), VT);
14764 }
14765
14766 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14767 N0.getOpcode() == ISD::SINT_TO_FP)) {
14768 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14769 N->getFlags());
14770 }
14771
14772 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14773 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14774 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14775 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14776 N->getFlags());
14777 }
14778
14779 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14780}
14781
14782bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14783 unsigned MaxDepth) const {
14784 unsigned Opcode = Op.getOpcode();
14785 if (Opcode == ISD::FCANONICALIZE)
14786 return true;
14787
14788 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14789 const auto &F = CFP->getValueAPF();
14790 if (F.isNaN() && F.isSignaling())
14791 return false;
14792 if (!F.isDenormal())
14793 return true;
14794
14795 DenormalMode Mode =
14796 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14797 return Mode == DenormalMode::getIEEE();
14798 }
14799
14800 // If source is a result of another standard FP operation it is already in
14801 // canonical form.
14802 if (MaxDepth == 0)
14803 return false;
14804
14805 switch (Opcode) {
14806 // These will flush denorms if required.
14807 case ISD::FADD:
14808 case ISD::FSUB:
14809 case ISD::FMUL:
14810 case ISD::FCEIL:
14811 case ISD::FFLOOR:
14812 case ISD::FMA:
14813 case ISD::FMAD:
14814 case ISD::FSQRT:
14815 case ISD::FDIV:
14816 case ISD::FREM:
14817 case ISD::FP_ROUND:
14818 case ISD::FP_EXTEND:
14819 case ISD::FP16_TO_FP:
14820 case ISD::FP_TO_FP16:
14821 case ISD::BF16_TO_FP:
14822 case ISD::FP_TO_BF16:
14823 case ISD::FLDEXP:
14824 case AMDGPUISD::FMUL_LEGACY:
14825 case AMDGPUISD::FMAD_FTZ:
14826 case AMDGPUISD::RCP:
14827 case AMDGPUISD::RSQ:
14828 case AMDGPUISD::RSQ_CLAMP:
14829 case AMDGPUISD::RCP_LEGACY:
14830 case AMDGPUISD::RCP_IFLAG:
14831 case AMDGPUISD::LOG:
14832 case AMDGPUISD::EXP:
14833 case AMDGPUISD::DIV_SCALE:
14834 case AMDGPUISD::DIV_FMAS:
14835 case AMDGPUISD::DIV_FIXUP:
14836 case AMDGPUISD::FRACT:
14837 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14838 case AMDGPUISD::CVT_F32_UBYTE0:
14839 case AMDGPUISD::CVT_F32_UBYTE1:
14840 case AMDGPUISD::CVT_F32_UBYTE2:
14841 case AMDGPUISD::CVT_F32_UBYTE3:
14842 case AMDGPUISD::FP_TO_FP16:
14843 case AMDGPUISD::SIN_HW:
14844 case AMDGPUISD::COS_HW:
14845 return true;
14846
14847 // It can/will be lowered or combined as a bit operation.
14848 // We need to check their inputs recursively to handle this.
14849 case ISD::FNEG:
14850 case ISD::FABS:
14851 case ISD::FCOPYSIGN:
14852 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14853
14854 case ISD::AND:
14855 if (Op.getValueType() == MVT::i32) {
14856 // Be careful, as we only know it is a bitcast of a floating point type. It
14857 // could be f32 or v2f16; we have no way of knowing. Luckily the constant
14858 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14859 // is valid to optimize for all types.
14860 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14861 if (RHS->getZExtValue() == 0xffff0000) {
14862 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14863 }
14864 }
14865 }
14866 break;
14867
14868 case ISD::FSIN:
14869 case ISD::FCOS:
14870 case ISD::FSINCOS:
14871 return Op.getValueType().getScalarType() != MVT::f16;
14872
14873 case ISD::FMINNUM:
14874 case ISD::FMAXNUM:
14875 case ISD::FMINNUM_IEEE:
14876 case ISD::FMAXNUM_IEEE:
14877 case ISD::FMINIMUM:
14878 case ISD::FMAXIMUM:
14879 case ISD::FMINIMUMNUM:
14880 case ISD::FMAXIMUMNUM:
14881 case AMDGPUISD::CLAMP:
14882 case AMDGPUISD::FMED3:
14883 case AMDGPUISD::FMAX3:
14884 case AMDGPUISD::FMIN3:
14885 case AMDGPUISD::FMAXIMUM3:
14886 case AMDGPUISD::FMINIMUM3: {
14887 // FIXME: Shouldn't treat the generic operations differently based on these.
14888 // However, we aren't really required to flush the result from
14889 // minnum/maxnum.
14890
14891 // snans will be quieted, so we only need to worry about denormals.
14892 if (Subtarget->supportsMinMaxDenormModes() ||
14893 // FIXME: denormalsEnabledForType is broken for dynamic
14894 denormalsEnabledForType(DAG, Op.getValueType()))
14895 return true;
14896
14897 // Flushing may be required.
14898 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14899 // targets we need to check their inputs recursively.
14900
14901 // FIXME: Does this apply with clamp? It's implemented with max.
14902 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14903 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14904 return false;
14905 }
14906
14907 return true;
14908 }
14909 case ISD::SELECT: {
14910 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14911 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14912 }
14913 case ISD::BUILD_VECTOR: {
14914 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14915 SDValue SrcOp = Op.getOperand(i);
14916 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14917 return false;
14918 }
14919
14920 return true;
14921 }
14922 case ISD::EXTRACT_VECTOR_ELT:
14923 case ISD::EXTRACT_SUBVECTOR: {
14924 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14925 }
14926 case ISD::INSERT_VECTOR_ELT: {
14927 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14928 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14929 }
14930 case ISD::UNDEF:
14931 // Could be anything.
14932 return false;
14933
14934 case ISD::BITCAST:
14935 // TODO: This is incorrect as it loses track of the operand's type. We may
14936 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14937 // same bits that are canonicalized in one type need not be in the other.
14938 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14939 case ISD::TRUNCATE: {
14940 // Hack around the mess we make when legalizing extract_vector_elt.
14941 if (Op.getValueType() == MVT::i16) {
14942 SDValue TruncSrc = Op.getOperand(0);
14943 if (TruncSrc.getValueType() == MVT::i32 &&
14944 TruncSrc.getOpcode() == ISD::BITCAST &&
14945 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14946 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14947 }
14948 }
14949 return false;
14950 }
14951 case ISD::INTRINSIC_WO_CHAIN: {
14952 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14953 // TODO: Handle more intrinsics
14954 switch (IntrinsicID) {
14955 case Intrinsic::amdgcn_cvt_pkrtz:
14956 case Intrinsic::amdgcn_cubeid:
14957 case Intrinsic::amdgcn_frexp_mant:
14958 case Intrinsic::amdgcn_fdot2:
14959 case Intrinsic::amdgcn_rcp:
14960 case Intrinsic::amdgcn_rsq:
14961 case Intrinsic::amdgcn_rsq_clamp:
14962 case Intrinsic::amdgcn_rcp_legacy:
14963 case Intrinsic::amdgcn_rsq_legacy:
14964 case Intrinsic::amdgcn_trig_preop:
14965 case Intrinsic::amdgcn_tanh:
14966 case Intrinsic::amdgcn_log:
14967 case Intrinsic::amdgcn_exp2:
14968 case Intrinsic::amdgcn_sqrt:
14969 return true;
14970 default:
14971 break;
14972 }
14973
14974 break;
14975 }
14976 default:
14977 break;
14978 }
14979
14980 // FIXME: denormalsEnabledForType is broken for dynamic
14981 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14982 DAG.isKnownNeverSNaN(Op);
14983}
14984
14985bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14986 unsigned MaxDepth) const {
14987 const MachineRegisterInfo &MRI = MF.getRegInfo();
14988 MachineInstr *MI = MRI.getVRegDef(Reg);
14989 unsigned Opcode = MI->getOpcode();
14990
14991 if (Opcode == AMDGPU::G_FCANONICALIZE)
14992 return true;
14993
14994 std::optional<FPValueAndVReg> FCR;
14995 // Constant splat (can be padded with undef) or scalar constant.
14996 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14997 if (FCR->Value.isSignaling())
14998 return false;
14999 if (!FCR->Value.isDenormal())
15000 return true;
15001
15002 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
15003 return Mode == DenormalMode::getIEEE();
15004 }
15005
15006 if (MaxDepth == 0)
15007 return false;
15008
15009 switch (Opcode) {
15010 case AMDGPU::G_FADD:
15011 case AMDGPU::G_FSUB:
15012 case AMDGPU::G_FMUL:
15013 case AMDGPU::G_FCEIL:
15014 case AMDGPU::G_FFLOOR:
15015 case AMDGPU::G_FRINT:
15016 case AMDGPU::G_FNEARBYINT:
15017 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15018 case AMDGPU::G_INTRINSIC_TRUNC:
15019 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15020 case AMDGPU::G_FMA:
15021 case AMDGPU::G_FMAD:
15022 case AMDGPU::G_FSQRT:
15023 case AMDGPU::G_FDIV:
15024 case AMDGPU::G_FREM:
15025 case AMDGPU::G_FPOW:
15026 case AMDGPU::G_FPEXT:
15027 case AMDGPU::G_FLOG:
15028 case AMDGPU::G_FLOG2:
15029 case AMDGPU::G_FLOG10:
15030 case AMDGPU::G_FPTRUNC:
15031 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15032 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15033 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15034 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15035 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15036 return true;
15037 case AMDGPU::G_FNEG:
15038 case AMDGPU::G_FABS:
15039 case AMDGPU::G_FCOPYSIGN:
15040 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
15041 case AMDGPU::G_FMINNUM:
15042 case AMDGPU::G_FMAXNUM:
15043 case AMDGPU::G_FMINNUM_IEEE:
15044 case AMDGPU::G_FMAXNUM_IEEE:
15045 case AMDGPU::G_FMINIMUM:
15046 case AMDGPU::G_FMAXIMUM:
15047 case AMDGPU::G_FMINIMUMNUM:
15048 case AMDGPU::G_FMAXIMUMNUM: {
15049 if (Subtarget->supportsMinMaxDenormModes() ||
15050 // FIXME: denormalsEnabledForType is broken for dynamic
15051 denormalsEnabledForType(MRI.getType(Reg), MF))
15052 return true;
15053
15054 [[fallthrough]];
15055 }
15056 case AMDGPU::G_BUILD_VECTOR:
15057 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
15058 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
15059 return false;
15060 return true;
15061 case AMDGPU::G_INTRINSIC:
15062 case AMDGPU::G_INTRINSIC_CONVERGENT:
15063 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15064 case Intrinsic::amdgcn_fmul_legacy:
15065 case Intrinsic::amdgcn_fmad_ftz:
15066 case Intrinsic::amdgcn_sqrt:
15067 case Intrinsic::amdgcn_fmed3:
15068 case Intrinsic::amdgcn_sin:
15069 case Intrinsic::amdgcn_cos:
15070 case Intrinsic::amdgcn_log:
15071 case Intrinsic::amdgcn_exp2:
15072 case Intrinsic::amdgcn_log_clamp:
15073 case Intrinsic::amdgcn_rcp:
15074 case Intrinsic::amdgcn_rcp_legacy:
15075 case Intrinsic::amdgcn_rsq:
15076 case Intrinsic::amdgcn_rsq_clamp:
15077 case Intrinsic::amdgcn_rsq_legacy:
15078 case Intrinsic::amdgcn_div_scale:
15079 case Intrinsic::amdgcn_div_fmas:
15080 case Intrinsic::amdgcn_div_fixup:
15081 case Intrinsic::amdgcn_fract:
15082 case Intrinsic::amdgcn_cvt_pkrtz:
15083 case Intrinsic::amdgcn_cubeid:
15084 case Intrinsic::amdgcn_cubema:
15085 case Intrinsic::amdgcn_cubesc:
15086 case Intrinsic::amdgcn_cubetc:
15087 case Intrinsic::amdgcn_frexp_mant:
15088 case Intrinsic::amdgcn_fdot2:
15089 case Intrinsic::amdgcn_trig_preop:
15090 case Intrinsic::amdgcn_tanh:
15091 return true;
15092 default:
15093 break;
15094 }
15095
15096 [[fallthrough]];
15097 default:
15098 return false;
15099 }
15100
15101 llvm_unreachable("invalid operation");
15102}
15103
15104// Constant fold canonicalize.
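// For example (f32, denormal mode PreserveSign): fcanonicalize(0x1p-149f)
// folds to +0.0, and fcanonicalize of a signaling NaN constant folds to the
// default quiet NaN bit pattern (0x7fc00000 for f32).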
15105SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15106 const SDLoc &SL, EVT VT,
15107 const APFloat &C) const {
15108 // Flush denormals to 0 if not enabled.
15109 if (C.isDenormal()) {
15110 DenormalMode Mode =
15111 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
15112 if (Mode == DenormalMode::getPreserveSign()) {
15113 return DAG.getConstantFP(
15114 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
15115 }
15116
15117 if (Mode != DenormalMode::getIEEE())
15118 return SDValue();
15119 }
15120
15121 if (C.isNaN()) {
15122 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
15123 if (C.isSignaling()) {
15124 // Quiet a signaling NaN.
15125 // FIXME: Is this supposed to preserve payload bits?
15126 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15127 }
15128
15129 // Make sure it is the canonical NaN bitpattern.
15130 //
15131 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15132 // immediate?
15133 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15134 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15135 }
15136
15137 // Already canonical.
15138 return DAG.getConstantFP(C, SL, VT);
15139}
15140
15141static bool vectorEltWillFoldAway(SDValue Op) {
15142 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
15143}
15144
15145SDValue
15146SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15147 DAGCombinerInfo &DCI) const {
15148 SelectionDAG &DAG = DCI.DAG;
15149 SDValue N0 = N->getOperand(0);
15150 EVT VT = N->getValueType(0);
15151
15152 // fcanonicalize undef -> qnan
15153 if (N0.isUndef()) {
15154 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
15155 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
15156 }
15157
15158 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
15159 EVT VT = N->getValueType(0);
15160 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15161 }
15162
15163 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15164 // (fcanonicalize k)
15165 //
15166 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15167
15168 // TODO: This could be better with wider vectors that will be split to v2f16,
15169 // and to consider uses since there aren't that many packed operations.
15170 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15171 isTypeLegal(MVT::v2f16)) {
15172 SDLoc SL(N);
15173 SDValue NewElts[2];
15174 SDValue Lo = N0.getOperand(0);
15175 SDValue Hi = N0.getOperand(1);
15176 EVT EltVT = Lo.getValueType();
15177
15178 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
15179 for (unsigned I = 0; I != 2; ++I) {
15180 SDValue Op = N0.getOperand(I);
15181 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15182 NewElts[I] =
15183 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15184 } else if (Op.isUndef()) {
15185 // Handled below based on what the other operand is.
15186 NewElts[I] = Op;
15187 } else {
15188 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
15189 }
15190 }
15191
15192 // If one half is undef, and one is constant, prefer a splat vector rather
15193 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15194 // cheaper to use and may be free with a packed operation.
15195 if (NewElts[0].isUndef()) {
15196 if (isa<ConstantFPSDNode>(NewElts[1]))
15197 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
15198 ? NewElts[1]
15199 : DAG.getConstantFP(0.0f, SL, EltVT);
15200 }
15201
15202 if (NewElts[1].isUndef()) {
15203 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
15204 ? NewElts[0]
15205 : DAG.getConstantFP(0.0f, SL, EltVT);
15206 }
15207
15208 return DAG.getBuildVector(VT, SL, NewElts);
15209 }
15210 }
15211
15212 return SDValue();
15213}
15214
15215static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15216 switch (Opc) {
15217 case ISD::FMAXNUM:
15218 case ISD::FMAXNUM_IEEE:
15219 case ISD::FMAXIMUMNUM:
15220 return AMDGPUISD::FMAX3;
15221 case ISD::FMAXIMUM:
15222 return AMDGPUISD::FMAXIMUM3;
15223 case ISD::SMAX:
15224 return AMDGPUISD::SMAX3;
15225 case ISD::UMAX:
15226 return AMDGPUISD::UMAX3;
15227 case ISD::FMINNUM:
15228 case ISD::FMINNUM_IEEE:
15229 case ISD::FMINIMUMNUM:
15230 return AMDGPUISD::FMIN3;
15231 case ISD::FMINIMUM:
15232 return AMDGPUISD::FMINIMUM3;
15233 case ISD::SMIN:
15234 return AMDGPUISD::SMIN3;
15235 case ISD::UMIN:
15236 return AMDGPUISD::UMIN3;
15237 default:
15238 llvm_unreachable("Not a min/max opcode");
15239 }
15240}
15241
15242SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15243 const SDLoc &SL, SDValue Src,
15244 SDValue MinVal,
15245 SDValue MaxVal,
15246 bool Signed) const {
15247
15248 // med3 comes from
15249 // min(max(x, K0), K1), K0 < K1
15250 // max(min(x, K0), K1), K1 < K0
15251 //
15252 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15253 // min/max op.
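// e.g. for signed i32: min(max(x, -5), 17) has K0 = -5 < K1 = 17 and becomes
// smed3(x, -5, 17), which selects the middle value and so clamps x to
// [-5, 17] in a single instruction.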
15254 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15255 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15256
15257 if (!MinK || !MaxK)
15258 return SDValue();
15259
15260 if (Signed) {
15261 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15262 return SDValue();
15263 } else {
15264 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15265 return SDValue();
15266 }
15267
15268 EVT VT = MinK->getValueType(0);
15269 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15270 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15271 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15272
15273 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15274 // not available, but this is unlikely to be profitable as constants
15275 // will often need to be materialized & extended, especially on
15276 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15277 return SDValue();
15278}
15279
15280static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15281 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
15282 return C;
15283
15284 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
15285 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15286 return C;
15287 }
15288
15289 return nullptr;
15290}
15291
15292SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15293 const SDLoc &SL, SDValue Op0,
15294 SDValue Op1) const {
15295 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
15296 if (!K1)
15297 return SDValue();
15298
15299 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
15300 if (!K0)
15301 return SDValue();
15302
15303 // Ordered >= (although NaN inputs should have folded away by now).
15304 if (K0->getValueAPF() > K1->getValueAPF())
15305 return SDValue();
15306
15307 // med3 with a nan input acts like
15308 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15309 //
15310 // So the result depends on whether the IEEE mode bit is enabled or not with a
15311 // signaling nan input.
15312 // ieee=1
15313 // s0 snan: yields s2
15314 // s1 snan: yields s2
15315 // s2 snan: qnan
15316
15317 // s0 qnan: min(s1, s2)
15318 // s1 qnan: min(s0, s2)
15319 // s2 qnan: min(s0, s1)
15320
15321 // ieee=0
15322 // s0 snan: min(s1, s2)
15323 // s1 snan: min(s0, s2)
15324 // s2 snan: qnan
15325
15326 // s0 qnan: min(s1, s2)
15327 // s1 qnan: min(s0, s2)
15328 // s2 qnan: min(s0, s1)
15329 const MachineFunction &MF = DAG.getMachineFunction();
15330 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15331
15332 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15333 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
15334 // can only form it from fmaxnum_ieee when IEEE=1.
15335 EVT VT = Op0.getValueType();
15336 if (Info->getMode().DX10Clamp) {
15337 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15338 // hardware fmed3 behavior converting to a min.
15339 // FIXME: Should this be allowing -0.0?
15340 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
15341 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
15342 }
15343
15344 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15345 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15346 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15347 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15348 // then give the other result, which is different from med3 with a NaN
15349 // input.
15350 SDValue Var = Op0.getOperand(0);
15351 if (!DAG.isKnownNeverSNaN(Var))
15352 return SDValue();
15353
15354 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15355
15356 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
15357 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
15358 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
15359 SDValue(K0, 0), SDValue(K1, 0));
15360 }
15361 }
15362
15363 return SDValue();
15364}
15365
15366/// \return true if the subtarget supports minimum3 and maximum3 with the given
15367/// base min/max opcode \p Opc for type \p VT.
15368static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15369 EVT VT) {
15370 switch (Opc) {
15371 case ISD::FMINNUM:
15372 case ISD::FMAXNUM:
15373 case ISD::FMINNUM_IEEE:
15374 case ISD::FMAXNUM_IEEE:
15375 case ISD::FMINIMUMNUM:
15376 case ISD::FMAXIMUMNUM:
15377 case AMDGPUISD::FMIN_LEGACY:
15378 case AMDGPUISD::FMAX_LEGACY:
15379 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15380 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15381 case ISD::FMINIMUM:
15382 case ISD::FMAXIMUM:
15383 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15384 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15385 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15386 case ISD::SMAX:
15387 case ISD::SMIN:
15388 case ISD::UMAX:
15389 case ISD::UMIN:
15390 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15391 default:
15392 return false;
15393 }
15394
15395 llvm_unreachable("not a min/max opcode");
15396}
15397
15398SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15399 DAGCombinerInfo &DCI) const {
15400 SelectionDAG &DAG = DCI.DAG;
15401
15402 EVT VT = N->getValueType(0);
15403 unsigned Opc = N->getOpcode();
15404 SDValue Op0 = N->getOperand(0);
15405 SDValue Op1 = N->getOperand(1);
15406
15407 // Only do this if the inner op has one use since this will just increase
15408 // register pressure for no benefit.
15409
15410 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
15411 // max(max(a, b), c) -> max3(a, b, c)
15412 // min(min(a, b), c) -> min3(a, b, c)
15413 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15414 SDLoc DL(N);
15415 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15416 Op0.getOperand(0), Op0.getOperand(1), Op1);
15417 }
15418
15419 // Try commuted.
15420 // max(a, max(b, c)) -> max3(a, b, c)
15421 // min(a, min(b, c)) -> min3(a, b, c)
15422 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15423 SDLoc DL(N);
15424 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
15425 Op0, Op1.getOperand(0), Op1.getOperand(1));
15426 }
15427 }
15428
15429 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15430 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15431 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15432 if (SDValue Med3 = performIntMed3ImmCombine(
15433 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
15434 return Med3;
15435 }
15436 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15437 if (SDValue Med3 = performIntMed3ImmCombine(
15438 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
15439 return Med3;
15440 }
15441
15442 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15443 if (SDValue Med3 = performIntMed3ImmCombine(
15444 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
15445 return Med3;
15446 }
15447 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15448 if (SDValue Med3 = performIntMed3ImmCombine(
15449 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
15450 return Med3;
15451 }
15452
15453 // if !is_snan(x):
15454 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15455 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15456 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15457 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15458 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15459 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15460 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15461 (Opc == AMDGPUISD::FMIN_LEGACY &&
15462 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15463 (VT == MVT::f32 || VT == MVT::f64 ||
15464 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15465 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15466 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15467 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15468 Op0.hasOneUse()) {
15469 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15470 return Res;
15471 }
15472
15473 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15474 // for some types, but at a higher cost since it's implemented with a 3
15475 // operand form.
15476 const SDNodeFlags Flags = N->getFlags();
15477 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
15478 !Subtarget->hasIEEEMinimumMaximumInsts() &&
15479 isOperationLegal(ISD::FMINNUM_IEEE, VT)) {
15480 unsigned NewOpc =
15481 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15482 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15483 }
15484
15485 return SDValue();
15486}
15487
15488static bool isClampZeroToOne(SDValue A, SDValue B) {
15489 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15490 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15491 // FIXME: Should this be allowing -0.0?
15492 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15493 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15494 }
15495 }
15496
15497 return false;
15498}
15499
15500// FIXME: Should only worry about snans for version with chain.
15501SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15502 DAGCombinerInfo &DCI) const {
15503 EVT VT = N->getValueType(0);
15504 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15505 // NaNs. With a NaN input, the order of the operands may change the result.
15506
15507 SelectionDAG &DAG = DCI.DAG;
15508 SDLoc SL(N);
15509
15510 SDValue Src0 = N->getOperand(0);
15511 SDValue Src1 = N->getOperand(1);
15512 SDValue Src2 = N->getOperand(2);
15513
15514 if (isClampZeroToOne(Src0, Src1)) {
15515 // const_a, const_b, x -> clamp is safe in all cases including signaling
15516 // nans.
15517 // FIXME: Should this be allowing -0.0?
15518 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15519 }
15520
15521 const MachineFunction &MF = DAG.getMachineFunction();
15522 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15523
15524 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15525 // handling no dx10-clamp?
15526 if (Info->getMode().DX10Clamp) {
15527 // If NaNs are clamped to 0, we are free to reorder the inputs.
15528
15529 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15530 std::swap(Src0, Src1);
15531
15532 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15533 std::swap(Src1, Src2);
15534
15535 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15536 std::swap(Src0, Src1);
15537
15538 if (isClampZeroToOne(Src1, Src2))
15539 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15540 }
15541
15542 return SDValue();
15543}
15544
15545SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15546 DAGCombinerInfo &DCI) const {
15547 SDValue Src0 = N->getOperand(0);
15548 SDValue Src1 = N->getOperand(1);
15549 if (Src0.isUndef() && Src1.isUndef())
15550 return DCI.DAG.getUNDEF(N->getValueType(0));
15551 return SDValue();
15552}
15553
15554// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15555// expanded into a set of cmp/select instructions.
15556bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15557 unsigned NumElem,
15558 bool IsDivergentIdx,
15559 const GCNSubtarget *Subtarget) {
15560 if (UseDivergentRegisterIndexing)
15561 return false;
15562
15563 unsigned VecSize = EltSize * NumElem;
15564
15565 // Sub-dword vectors of size 2 dwords or less have a better implementation.
15566 if (VecSize <= 64 && EltSize < 32)
15567 return false;
15568
15569 // Always expand the rest of the sub-dword instructions, otherwise they will
15570 // be lowered via memory.
15571 if (EltSize < 32)
15572 return true;
15573
15574 // Always do this if var-idx is divergent, otherwise it will become a loop.
15575 if (IsDivergentIdx)
15576 return true;
15577
15578 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15579 unsigned NumInsts = NumElem /* Number of compares */ +
15580 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
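// e.g. a uniform-index v8f32 extract needs 8 compares + 8 cndmasks = 16
// instructions, which is accepted in VGPR index mode (16 <= 16) but rejected
// when movrel is available (16 > 15).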
15581
15582 // On some architectures (GFX9) movrel is not available and it's better
15583 // to expand.
15584 if (Subtarget->useVGPRIndexMode())
15585 return NumInsts <= 16;
15586
15587 // If movrel is available, use it instead of expanding for vector of 8
15588 // elements.
15589 if (Subtarget->hasMovrel())
15590 return NumInsts <= 15;
15591
15592 return true;
15593}
15594
15595bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15596 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15597 if (isa<ConstantSDNode>(Idx))
15598 return false;
15599
15600 SDValue Vec = N->getOperand(0);
15601 EVT VecVT = Vec.getValueType();
15602 EVT EltVT = VecVT.getVectorElementType();
15603 unsigned EltSize = EltVT.getSizeInBits();
15604 unsigned NumElem = VecVT.getVectorNumElements();
15605
15606 return SITargetLowering::shouldExpandVectorDynExt(
15607 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15608}
15609
15610SDValue
15611SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15612 DAGCombinerInfo &DCI) const {
15613 SDValue Vec = N->getOperand(0);
15614 SelectionDAG &DAG = DCI.DAG;
15615
15616 EVT VecVT = Vec.getValueType();
15617 EVT VecEltVT = VecVT.getVectorElementType();
15618 EVT ResVT = N->getValueType(0);
15619
15620 unsigned VecSize = VecVT.getSizeInBits();
15621 unsigned VecEltSize = VecEltVT.getSizeInBits();
15622
15623 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15624 allUsesHaveSourceMods(N)) {
15625 SDLoc SL(N);
15626 SDValue Idx = N->getOperand(1);
15627 SDValue Elt =
15628 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15629 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15630 }
15631
15632 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15633 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15634 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15635 // depending on the shift operand. See e.g. performSraCombine().
15636 // This combine ensures that the optimisation is compatible with v2i32
15637 // legalised AND.
15638 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15639 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15640
15641 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
15642 if (!C || C->getZExtValue() != 0x1f)
15643 return SDValue();
15644
15645 SDLoc SL(N);
15646 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15647 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15648 Vec->getOperand(0), N->getOperand(1));
15649 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15650 DAG.ReplaceAllUsesWith(N, A.getNode());
15651 }
15652
15653 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15654 // =>
15655 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15656 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15657 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15658 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15659 SDLoc SL(N);
15660 SDValue Idx = N->getOperand(1);
15661 unsigned Opc = Vec.getOpcode();
15662
15663 switch (Opc) {
15664 default:
15665 break;
15666 // TODO: Support other binary operations.
15667 case ISD::FADD:
15668 case ISD::FSUB:
15669 case ISD::FMUL:
15670 case ISD::ADD:
15671 case ISD::UMIN:
15672 case ISD::UMAX:
15673 case ISD::SMIN:
15674 case ISD::SMAX:
15675 case ISD::FMAXNUM:
15676 case ISD::FMINNUM:
15677 case ISD::FMAXNUM_IEEE:
15678 case ISD::FMINNUM_IEEE:
15679 case ISD::FMAXIMUM:
15680 case ISD::FMINIMUM: {
15681 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15682 Vec.getOperand(0), Idx);
15683 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15684 Vec.getOperand(1), Idx);
15685
15686 DCI.AddToWorklist(Elt0.getNode());
15687 DCI.AddToWorklist(Elt1.getNode());
15688 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15689 }
15690 }
15691 }
15692
15693 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15694 if (shouldExpandVectorDynExt(N)) {
15695 SDLoc SL(N);
15696 SDValue Idx = N->getOperand(1);
15697 SDValue V;
15698 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15699 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15700 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15701 if (I == 0)
15702 V = Elt;
15703 else
15704 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15705 }
15706 return V;
15707 }
15708
15709 if (!DCI.isBeforeLegalize())
15710 return SDValue();
15711
15712 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15713 // elements. This exposes more load reduction opportunities by replacing
15714 // multiple small extract_vector_elements with a single 32-bit extract.
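// e.g. extracting element 5 of a loaded v8i8 becomes: bitcast to v2i32,
// extract dword 1, shift right by 8 bits, then truncate to i8.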
15715 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15716 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15717 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15718 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15719
15720 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15721 unsigned EltIdx = BitIndex / 32;
15722 unsigned LeftoverBitIdx = BitIndex % 32;
15723 SDLoc SL(N);
15724
15725 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15726 DCI.AddToWorklist(Cast.getNode());
15727
15728 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15729 DAG.getConstant(EltIdx, SL, MVT::i32));
15730 DCI.AddToWorklist(Elt.getNode());
15731 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15732 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15733 DCI.AddToWorklist(Srl.getNode());
15734
15735 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15736 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15737 DCI.AddToWorklist(Trunc.getNode());
15738
15739 if (VecEltVT == ResVT) {
15740 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15741 }
15742
15743 assert(ResVT.isScalarInteger());
15744 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15745 }
15746
15747 return SDValue();
15748}
15749
15750SDValue
15751SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15752 DAGCombinerInfo &DCI) const {
15753 SDValue Vec = N->getOperand(0);
15754 SDValue Idx = N->getOperand(2);
15755 EVT VecVT = Vec.getValueType();
15756 EVT EltVT = VecVT.getVectorElementType();
15757
15758 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15759 // => BUILD_VECTOR n x select (e, const-idx)
15760 if (!shouldExpandVectorDynExt(N))
15761 return SDValue();
15762
15763 SelectionDAG &DAG = DCI.DAG;
15764 SDLoc SL(N);
15765 SDValue Ins = N->getOperand(1);
15766 EVT IdxVT = Idx.getValueType();
15767
15768 SmallVector<SDValue, 16> Ops;
15769 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15770 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15771 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15772 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15773 Ops.push_back(V);
15774 }
15775
15776 return DAG.getBuildVector(VecVT, SL, Ops);
15777}
15778
15779/// Return the source of an fp_extend from f16 to f32, or a converted FP
15780/// constant.
15781static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15782 if (Src.getOpcode() == ISD::FP_EXTEND &&
15783 Src.getOperand(0).getValueType() == MVT::f16) {
15784 return Src.getOperand(0);
15785 }
15786
15787 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15788 APFloat Val = CFP->getValueAPF();
15789 bool LosesInfo = true;
15790 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15791 if (!LosesInfo)
15792 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15793 }
15794
15795 return SDValue();
15796}
15797
15798SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15799 DAGCombinerInfo &DCI) const {
15800 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15801 "combine only useful on gfx8");
15802
15803 SDValue TruncSrc = N->getOperand(0);
15804 EVT VT = N->getValueType(0);
15805 if (VT != MVT::f16)
15806 return SDValue();
15807
15808 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15809 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15810 return SDValue();
15811
15812 SelectionDAG &DAG = DCI.DAG;
15813 SDLoc SL(N);
15814
15815 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15816 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15817 // casting back.
15818
15819 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15820 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15821 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15822 if (!A)
15823 return SDValue();
15824
15825 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15826 if (!B)
15827 return SDValue();
15828
15829 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15830 if (!C)
15831 return SDValue();
15832
15833 // This changes signaling nan behavior. If an input is a signaling nan, it
15834 // would have been quieted by the fpext originally. We don't care because
15835 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15836 // we would be worse off than just doing the promotion.
15837 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15838 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15839 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15840 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15841}
15842
15843unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15844 const SDNode *N0,
15845 const SDNode *N1) const {
15846 EVT VT = N0->getValueType(0);
15847
15848 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15849 // support denormals ever.
15850 if (((VT == MVT::f32 &&
15851 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15852 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15853 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15854 isOperationLegal(ISD::FMAD, VT))
15855 return ISD::FMAD;
15856
15857 const TargetOptions &Options = DAG.getTarget().Options;
15858 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15859 (N0->getFlags().hasAllowContract() &&
15860 N1->getFlags().hasAllowContract())) &&
15861 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15862 return ISD::FMA;
15863 }
15864
15865 return 0;
15866}
15867
15868// For a reassociatable opcode perform:
15869// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
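// e.g. for an i32 add this turns two VALU adds into one SALU add of the two
// uniform values feeding a single VALU add with the divergent value.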
15870SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15871 SelectionDAG &DAG) const {
15872 EVT VT = N->getValueType(0);
15873 if (VT != MVT::i32 && VT != MVT::i64)
15874 return SDValue();
15875
15876 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15877 return SDValue();
15878
15879 unsigned Opc = N->getOpcode();
15880 SDValue Op0 = N->getOperand(0);
15881 SDValue Op1 = N->getOperand(1);
15882
15883 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15884 return SDValue();
15885
15886 if (Op0->isDivergent())
15887 std::swap(Op0, Op1);
15888
15889 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15890 return SDValue();
15891
15892 SDValue Op2 = Op1.getOperand(1);
15893 Op1 = Op1.getOperand(0);
15894 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15895 return SDValue();
15896
15897 if (Op1->isDivergent())
15898 std::swap(Op1, Op2);
15899
15900 SDLoc SL(N);
15901 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15902 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15903}
15904
15905static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15906 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15907 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15908 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15909 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15910 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15911}
15912
15913// Fold
15914// y = lshr i64 x, 32
15915// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15916// with Const.hi == -1
15917// To
15918// res = mad_u64_u32 y.lo, Const.lo, x.lo
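// This is sound because y = x >> 32 fits in 32 bits:
// y * Const + x = y * Const.lo + 2^32 * y * (Const.hi + 1) + x.lo (mod 2^64),
// and Const.hi == -1 makes the middle term vanish, leaving exactly
// mad_u64_u32(y.lo, Const.lo, x.lo).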
15919static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15920 SDValue MulLHS, SDValue MulRHS,
15921 SDValue AddRHS) {
15922 if (MulRHS.getOpcode() == ISD::SRL)
15923 std::swap(MulLHS, MulRHS);
15924
15925 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15926 return SDValue();
15927
15928 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15929 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15930 MulLHS.getOperand(0) != AddRHS)
15931 return SDValue();
15932
15933 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS.getNode());
15934 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15935 return SDValue();
15936
15937 SDValue ConstMul =
15938 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15939 return getMad64_32(DAG, SL, MVT::i64,
15940 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15941 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15942}
15943
15944// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15945// multiplies, if any.
15946//
15947// Full 64-bit multiplies that feed into an addition are lowered here instead
15948// of using the generic expansion. The generic expansion ends up with
15949// a tree of ADD nodes that prevents us from using the "add" part of the
15950// MAD instruction. The expansion produced here results in a chain of ADDs
15951// instead of a tree.
15952SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15953 DAGCombinerInfo &DCI) const {
15954 assert(N->isAnyAdd());
15955
15956 SelectionDAG &DAG = DCI.DAG;
15957 EVT VT = N->getValueType(0);
15958 SDLoc SL(N);
15959 SDValue LHS = N->getOperand(0);
15960 SDValue RHS = N->getOperand(1);
15961
15962 if (VT.isVector())
15963 return SDValue();
15964
15965 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15966 // result in scalar registers for uniform values.
15967 if (!N->isDivergent() && Subtarget->hasSMulHi())
15968 return SDValue();
15969
15970 unsigned NumBits = VT.getScalarSizeInBits();
15971 if (NumBits <= 32 || NumBits > 64)
15972 return SDValue();
15973
15974 if (LHS.getOpcode() != ISD::MUL) {
15975 assert(RHS.getOpcode() == ISD::MUL);
15976 std::swap(LHS, RHS);
15977 }
15978
15979 // Avoid the fold if it would unduly increase the number of multiplies due to
15980 // multiple uses, except on hardware with full-rate multiply-add (which is
15981 // part of full-rate 64-bit ops).
15982 if (!Subtarget->hasFullRate64Ops()) {
15983 unsigned NumUsers = 0;
15984 for (SDNode *User : LHS->users()) {
15985 // There is a use that does not feed into addition, so the multiply can't
15986 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15987 if (!User->isAnyAdd())
15988 return SDValue();
15989
15990 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15991 // MUL + 3xADD + 3xADDC over 3xMAD.
15992 ++NumUsers;
15993 if (NumUsers >= 3)
15994 return SDValue();
15995 }
15996 }
15997
15998 SDValue MulLHS = LHS.getOperand(0);
15999 SDValue MulRHS = LHS.getOperand(1);
16000 SDValue AddRHS = RHS;
16001
16002 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16003 return FoldedMAD;
16004
16005 // Always check whether operands are small unsigned values, since that
16006 // knowledge is useful in more cases. Check for small signed values only if
16007 // doing so can unlock a shorter code sequence.
16008 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
16009 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
16010
16011 bool MulSignedLo = false;
16012 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16013 MulSignedLo =
16014 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
16015 }
16016
16017 // The operands and final result all have the same number of bits. If
16018 // operands need to be extended, they can be extended with garbage. The
16019 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16020 // truncated away in the end.
16021 if (VT != MVT::i64) {
16022 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
16023 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
16024 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
16025 }
16026
16027 // The basic code generated is conceptually straightforward. Pseudo code:
16028 //
16029 // accum = mad_64_32 lhs.lo, rhs.lo, accum
16030 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16031 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16032 //
16033 // The second and third lines are optional, depending on whether the factors
16034 // are {sign,zero}-extended or not.
16035 //
16036 // The actual DAG is noisier than the pseudo code, but only due to
16037 // instructions that disassemble values into low and high parts, and
16038 // assemble the final result.
16039 SDValue One = DAG.getConstant(1, SL, MVT::i32);
16040
16041 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
16042 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
16043 SDValue Accum =
16044 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16045
16046 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16047 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16048
16049 if (!MulLHSUnsigned32) {
16050 auto MulLHSHi =
16051 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
16052 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
16053 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16054 }
16055
16056 if (!MulRHSUnsigned32) {
16057 auto MulRHSHi =
16058 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
16059 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
16060 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16061 }
16062
16063 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
16064 Accum = DAG.getBitcast(MVT::i64, Accum);
16065 }
16066
16067 if (VT != MVT::i64)
16068 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
16069 return Accum;
16070}
16071
16072SDValue
16073SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16074 DAGCombinerInfo &DCI) const {
16075 SDValue RHS = N->getOperand(1);
16076 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16077 if (!CRHS)
16078 return SDValue();
16079
16080 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16081 // common.
16082 uint64_t Val = CRHS->getZExtValue();
16083 if (countr_zero(Val) >= 32) {
16084 SelectionDAG &DAG = DCI.DAG;
16085 SDLoc SL(N);
16086 SDValue LHS = N->getOperand(0);
16087
16088 // Avoid carry machinery if we know the low half of the add does not
16089 // contribute to the final result.
16090 //
16091 // add i64:x, K if computeTrailingZeros(K) >= 32
16092 // => build_pair (add x.hi, K.hi), x.lo
16093
16094 // Breaking the 64-bit add here with this strange constant is unlikely
16095 // to interfere with addressing mode patterns.
16096
16097 SDValue Hi = getHiHalf64(LHS, DAG);
16098 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
16099 unsigned Opcode = N->getOpcode();
16100 if (Opcode == ISD::PTRADD)
16101 Opcode = ISD::ADD;
16102 SDValue AddHi =
16103 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16104
16105 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
16106 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
16107 }
16108
16109 return SDValue();
16110}
16111
16112// Collect the ultimate src of each of the mul node's operands, and confirm
16113 // each operand is 8 bits.
16114static std::optional<ByteProvider<SDValue>>
16115handleMulOperand(const SDValue &MulOperand) {
16116 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
16117 if (!Byte0 || Byte0->isConstantZero()) {
16118 return std::nullopt;
16119 }
16120 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
16121 if (Byte1 && !Byte1->isConstantZero()) {
16122 return std::nullopt;
16123 }
16124 return Byte0;
16125}
16126
16127static unsigned addPermMasks(unsigned First, unsigned Second) {
16128 unsigned FirstCs = First & 0x0c0c0c0c;
16129 unsigned SecondCs = Second & 0x0c0c0c0c;
16130 unsigned FirstNoCs = First & ~0x0c0c0c0c;
16131 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16132
16133 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16134 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16135 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16136 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16137
16138 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16139}
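// For illustration, with assumed mask values (not taken from the source):
// addPermMasks(0x0c0c0c00, 0x0c0c010c) keeps the non-0x0c byte selects of both
// masks (0x00 in byte 0, 0x01 in byte 1) and leaves 0x0c only where both
// inputs had it, yielding 0x0c0c0100.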
16140
16141struct DotSrc {
16142 SDValue SrcOp;
16143 int64_t PermMask;
16144 int64_t DWordOffset;
16145};
16146
16147 static void placeSources(ByteProvider<SDValue> Src0,
16148 ByteProvider<SDValue> Src1,
16149 SmallVectorImpl<DotSrc> &Src0s,
16150 SmallVectorImpl<DotSrc> &Src1s, int Step) {
16151
16152 assert(Src0.Src.has_value() && Src1.Src.has_value());
16153 // Src0s and Src1s are empty, just place arbitrarily.
16154 if (Step == 0) {
16155 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
16156 Src0.SrcOffset / 4});
16157 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
16158 Src1.SrcOffset / 4});
16159 return;
16160 }
16161
16162 for (int BPI = 0; BPI < 2; BPI++) {
16163 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16164 if (BPI == 1) {
16165 BPP = {Src1, Src0};
16166 }
16167 unsigned ZeroMask = 0x0c0c0c0c;
16168 unsigned FMask = 0xFF << (8 * (3 - Step));
16169
16170 unsigned FirstMask =
16171 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16172 unsigned SecondMask =
16173 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16174 // Attempt to find a Src vector which contains our SDValue; if found, add our
16175 // perm mask to the existing one. If we are unable to find a match for the
16176 // first SDValue, attempt to find a match for the second.
16177 int FirstGroup = -1;
16178 for (int I = 0; I < 2; I++) {
16179 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
16180 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16181 return IterElt.SrcOp == *BPP.first.Src &&
16182 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16183 };
16184
16185 auto *Match = llvm::find_if(Srcs, MatchesFirst);
16186 if (Match != Srcs.end()) {
16187 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16188 FirstGroup = I;
16189 break;
16190 }
16191 }
16192 if (FirstGroup != -1) {
16193 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
16194 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16195 return IterElt.SrcOp == *BPP.second.Src &&
16196 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16197 };
16198 auto *Match = llvm::find_if(Srcs, MatchesSecond);
16199 if (Match != Srcs.end()) {
16200 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16201 } else
16202 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16203 return;
16204 }
16205 }
16206
16207 // If we have made it here, then we could not find a match in Src0s or Src1s
16208 // for either Src0 or Src1, so just place them arbitrarily.
16209
16210 unsigned ZeroMask = 0x0c0c0c0c;
16211 unsigned FMask = 0xFF << (8 * (3 - Step));
16212
16213 Src0s.push_back(
16214 {*Src0.Src,
16215 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16216 Src0.SrcOffset / 4});
16217 Src1s.push_back(
16218 {*Src1.Src,
16219 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16220 Src1.SrcOffset / 4});
16221}
16222
16223 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
16224 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
16225 bool IsAny) {
16226
16227 // If we just have one source, just permute it accordingly.
16228 if (Srcs.size() == 1) {
16229 auto *Elt = Srcs.begin();
16230 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
16231
16232 // v_perm will produce the original value
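// (0x3020100 selects bytes 3..0 of the source in order, i.e. the identity
// permutation, so the v_perm would be a no-op.)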
16233 if (Elt->PermMask == 0x3020100)
16234 return EltOp;
16235
16236 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16237 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
16238 }
16239
16240 auto *FirstElt = Srcs.begin();
16241 auto *SecondElt = std::next(FirstElt);
16242 
16243 SmallVector<SDValue, 3> Perms;
16244
16245 // If we have multiple sources in the chain, combine them via perms (using
16246 // calculated perm mask) and Ors.
16247 while (true) {
16248 auto FirstMask = FirstElt->PermMask;
16249 auto SecondMask = SecondElt->PermMask;
16250
16251 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16252 unsigned FirstPlusFour = FirstMask | 0x04040404;
16253 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
16254 // original 0x0C.
16255 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
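// (Illustrative note: the adjustment above moves First's byte selects into
// the other selector range, e.g. a selector byte 0x02 becomes 0x06, so that
// FirstVal's and SecondVal's bytes are taken from different v_perm source
// operands; the 0x0c "pick zero" bytes are restored from FirstCs.)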
16256
16257 auto PermMask = addPermMasks(FirstMask, SecondMask);
16258 auto FirstVal =
16259 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16260 auto SecondVal =
16261 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
16262
16263 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
16264 SecondVal,
16265 DAG.getConstant(PermMask, SL, MVT::i32)));
16266
16267 FirstElt = std::next(SecondElt);
16268 if (FirstElt == Srcs.end())
16269 break;
16270
16271 SecondElt = std::next(FirstElt);
16272 // If we only have a FirstElt, then just combine that into the cumulative
16273 // source node.
16274 if (SecondElt == Srcs.end()) {
16275 auto EltOp =
16276 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16277
16278 Perms.push_back(
16279 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16280 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16281 break;
16282 }
16283 }
16284
16285 assert(Perms.size() == 1 || Perms.size() == 2);
16286 return Perms.size() == 2
16287 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
16288 : Perms[0];
16289}
16290
16291static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16292 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16293 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16294 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16295 EntryMask += ZeroMask;
16296 }
16297}
16298
16299static bool isMul(const SDValue Op) {
16300 auto Opcode = Op.getOpcode();
16301
16302 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16303 Opcode == AMDGPUISD::MUL_I24);
16304}
16305
16306static std::optional<bool>
16307 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
16308 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16309 const SDValue &S1Op, const SelectionDAG &DAG) {
16310 // If both ops are i8s (pre legalize-dag), then the signedness semantics
16311 // of the dot4 are irrelevant.
16312 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16313 return false;
16314
16315 auto Known0 = DAG.computeKnownBits(S0Op, 0);
16316 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16317 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16318 auto Known1 = DAG.computeKnownBits(S1Op, 0);
16319 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16320 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16321
16322 assert(!(S0IsUnsigned && S0IsSigned));
16323 assert(!(S1IsUnsigned && S1IsSigned));
16324
16325 // There are 9 possible permutations of
16326 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
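// Summary of the cases handled below ("?" = sign bit unknown):
//   both known, same sign      -> return that signedness
//   both known, opposite signs -> std::nullopt (bad match)
//   signed + ?, or ? + ?       -> treat as signed
//   unsigned + ?               -> std::nullopt (bad match)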
16327
16328 // In two permutations, the sign bits are known to be the same for both Ops,
16329 // so simply return Signed / Unsigned corresponding to the MSB
16330
16331 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16332 return S0IsSigned;
16333
16334 // In another two permutations, the sign bits are known to be opposite. In
16335 // this case return std::nullopt to indicate a bad match.
16336
16337 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16338 return std::nullopt;
16339
16340 // In the remaining five permutations, we don't know the value of the sign
16341 // bit for at least one Op. Since we have a valid ByteProvider, we know that
16342 // the upper bits must be extension bits. Thus, the only ways for the sign
16343 // bit to be unknown are if it was sign-extended from an unknown value or if
16344 // it was any-extended. In either case, it is correct to use the signed
16345 // version of the dot4 signedness semantics.
16346
16347 // In two such permutations, we know the sign bit is set for
16348 // one op and unknown for the other. It is okay to use the signed version
16349 // of dot4.
16350 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16351 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16352 return true;
16353
16354 // In one such permutation, we don't know either of the sign bits. It is okay
16355 // to use the signed version of dot4.
16356 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16357 return true;
16358
16359 // In two such permutations, we know the sign bit is unset for
16360 // one op and unknown for the other. Return std::nullopt to indicate a
16361 // bad match.
16362 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16363 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16364 return std::nullopt;
16365
16366 llvm_unreachable("Fully covered condition");
16367}
16368
16369SDValue SITargetLowering::performAddCombine(SDNode *N,
16370 DAGCombinerInfo &DCI) const {
16371 SelectionDAG &DAG = DCI.DAG;
16372 EVT VT = N->getValueType(0);
16373 SDLoc SL(N);
16374 SDValue LHS = N->getOperand(0);
16375 SDValue RHS = N->getOperand(1);
16376
16377 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16378 if (Subtarget->hasMad64_32()) {
16379 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16380 return Folded;
16381 }
16382 }
16383
16384 if (SDValue V = reassociateScalarOps(N, DAG)) {
16385 return V;
16386 }
16387
16388 if (VT == MVT::i64) {
16389 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16390 return Folded;
16391 }
16392
16393 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
16394 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16395 SDValue TempNode(N, 0);
16396 std::optional<bool> IsSigned;
16397 SmallVector<DotSrc, 4> Src0s;
16398 SmallVector<DotSrc, 4> Src1s;
16399 SmallVector<SDValue, 4> Src2s;
16400
16401 // Match the v_dot4 tree, while collecting src nodes.
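// Illustrative shape of a length-4 chain (hypothetical operands):
//   add (mul a3, b3), (add (mul a2, b2), (add (mul a1, b1), (mul a0, b0)))
// i.e. up to four byte-sized multiplies feeding one i32 accumulator, which
// maps onto v_dot4.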
16402 int ChainLength = 0;
16403 for (int I = 0; I < 4; I++) {
16404 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
16405 if (MulIdx == -1)
16406 break;
16407 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16408 if (!Src0)
16409 break;
16410 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16411 if (!Src1)
16412 break;
16413
16414 auto IterIsSigned = checkDot4MulSignedness(
16415 TempNode->getOperand(MulIdx), *Src0, *Src1,
16416 TempNode->getOperand(MulIdx)->getOperand(0),
16417 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16418 if (!IterIsSigned)
16419 break;
16420 if (!IsSigned)
16421 IsSigned = *IterIsSigned;
16422 if (*IterIsSigned != *IsSigned)
16423 break;
16424 placeSources(*Src0, *Src1, Src0s, Src1s, I);
16425 auto AddIdx = 1 - MulIdx;
16426 // Allow the special case where add (add (mul24, 0), mul24) was folded into
16427 // add (mul24, mul24).
16428 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16429 Src2s.push_back(TempNode->getOperand(AddIdx));
16430 auto Src0 =
16431 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
16432 if (!Src0)
16433 break;
16434 auto Src1 =
16435 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
16436 if (!Src1)
16437 break;
16438 auto IterIsSigned = checkDot4MulSignedness(
16439 TempNode->getOperand(AddIdx), *Src0, *Src1,
16440 TempNode->getOperand(AddIdx)->getOperand(0),
16441 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16442 if (!IterIsSigned)
16443 break;
16444 assert(IsSigned);
16445 if (*IterIsSigned != *IsSigned)
16446 break;
16447 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
16448 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
16449 ChainLength = I + 2;
16450 break;
16451 }
16452
16453 TempNode = TempNode->getOperand(AddIdx);
16454 Src2s.push_back(TempNode);
16455 ChainLength = I + 1;
16456 if (TempNode->getNumOperands() < 2)
16457 break;
16458 LHS = TempNode->getOperand(0);
16459 RHS = TempNode->getOperand(1);
16460 }
16461
16462 if (ChainLength < 2)
16463 return SDValue();
16464
16465 // Masks were constructed with the assumption that we would find a chain of
16466 // length 4. If not, then we need to zero out the unused MSB bytes (via a
16467 // perm mask of 0x0c) so they do not affect the dot calculation.
16468 if (ChainLength < 4) {
16469 fixMasks(Src0s, ChainLength);
16470 fixMasks(Src1s, ChainLength);
16471 }
16472
16473 SDValue Src0, Src1;
16474
16475 // If we are just using a single source for both, and have permuted the
16476 // bytes consistently, we can just use the sources without permuting
16477 // (commutation).
16478 bool UseOriginalSrc = false;
16479 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16480 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16481 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16482 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16483 SmallVector<unsigned, 4> SrcBytes;
16484 auto Src0Mask = Src0s.begin()->PermMask;
16485 SrcBytes.push_back(Src0Mask & 0xFF000000);
16486 bool UniqueEntries = true;
16487 for (auto I = 1; I < 4; I++) {
16488 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16489
16490 if (is_contained(SrcBytes, NextByte)) {
16491 UniqueEntries = false;
16492 break;
16493 }
16494 SrcBytes.push_back(NextByte);
16495 }
16496
16497 if (UniqueEntries) {
16498 UseOriginalSrc = true;
16499
16500 auto *FirstElt = Src0s.begin();
16501 auto FirstEltOp =
16502 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16503
16504 auto *SecondElt = Src1s.begin();
16505 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16506 SecondElt->DWordOffset);
16507
16508 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16509 MVT::getIntegerVT(32));
16510 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16511 MVT::getIntegerVT(32));
16512 }
16513 }
16514
16515 if (!UseOriginalSrc) {
16516 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16517 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16518 }
16519
16520 assert(IsSigned);
16521 SDValue Src2 =
16522 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16523
16524 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16525 : Intrinsic::amdgcn_udot4,
16526 SL, MVT::i64);
16527
16528 assert(!VT.isVector());
16529 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16530 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16531
16532 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16533 }
16534
16535 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16536 return SDValue();
16537
16538 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16539 // add x, sext (setcc) => usubo_carry x, 0, setcc
16540 unsigned Opc = LHS.getOpcode();
16541 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16542 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16543 std::swap(RHS, LHS);
16544
16545 Opc = RHS.getOpcode();
16546 switch (Opc) {
16547 default:
16548 break;
16549 case ISD::ZERO_EXTEND:
16550 case ISD::SIGN_EXTEND:
16551 case ISD::ANY_EXTEND: {
16552 auto Cond = RHS.getOperand(0);
16553 // If this won't be a real VOPC output, we would still need to insert an
16554 // extra instruction anyway.
16555 if (!isBoolSGPR(Cond))
16556 break;
16557 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16558 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16559 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16560 return DAG.getNode(Opc, SL, VTList, Args);
16561 }
16562 case ISD::UADDO_CARRY: {
16563 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16564 if (!isNullConstant(RHS.getOperand(1)))
16565 break;
16566 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16567 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16568 }
16569 }
16570 return SDValue();
16571}
16572
16573SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16574 DAGCombinerInfo &DCI) const {
16575 SelectionDAG &DAG = DCI.DAG;
16576 SDLoc DL(N);
16577 EVT VT = N->getValueType(0);
16578 SDValue N0 = N->getOperand(0);
16579 SDValue N1 = N->getOperand(1);
16580
16581 // The following folds transform PTRADDs into regular arithmetic in cases
16582 // where the PTRADD wouldn't be folded as an immediate offset into memory
16583 // instructions anyway. They are target-specific in that other targets might
16584 // prefer to not lose information about the pointer arithmetic.
16585
16586 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16587 // Adapted from DAGCombiner::visitADDLikeCommutative.
16588 SDValue V, K;
16589 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16590 SDNodeFlags ShlFlags = N1->getFlags();
16591 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16592 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16593 // preserved.
16594 SDNodeFlags NewShlFlags =
16595 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16596 ? SDNodeFlags::NoSignedWrap
16597 : SDNodeFlags();
16598 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16599 DCI.AddToWorklist(Inner.getNode());
16600 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16601 }
16602
16603 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16604 // performAddCombine.
16605 if (N1.getOpcode() == ISD::MUL) {
16606 if (Subtarget->hasMad64_32()) {
16607 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16608 return Folded;
16609 }
16610 }
16611
16612 // If the 32 low bits of the constant are all zero, there is nothing to fold
16613 // into an immediate offset, so it's better to eliminate the unnecessary
16614 // addition for the lower 32 bits than to preserve the PTRADD.
16615 // Analogous to a fold in performAddCombine.
16616 if (VT == MVT::i64) {
16617 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16618 return Folded;
16619 }
16620
16621 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16622 return SDValue();
16623
16624 SDValue X = N0;
16625 SDValue Y = N1.getOperand(0);
16626 SDValue Z = N1.getOperand(1);
16627 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16628 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16629
16630 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16631 Y->isDivergent() != Z->isDivergent()) {
16632 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16633 // y are uniform and z isn't.
16634 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16635 // z are uniform and y isn't.
16636 // The goal is to push uniform operands up in the computation, so that they
16637 // can be handled with scalar operations. We can't use reassociateScalarOps
16638 // for this since it requires two identical commutative operations to
16639 // reassociate.
16640 if (Y->isDivergent())
16641 std::swap(Y, Z);
16642 // If both additions in the original were NUW, reassociation preserves that.
16643 SDNodeFlags ReassocFlags =
16644 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16645 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16646 DCI.AddToWorklist(UniformInner.getNode());
16647 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16648 }
16649
16650 return SDValue();
16651}
16652
16653SDValue SITargetLowering::performSubCombine(SDNode *N,
16654 DAGCombinerInfo &DCI) const {
16655 SelectionDAG &DAG = DCI.DAG;
16656 EVT VT = N->getValueType(0);
16657
16658 if (VT == MVT::i64) {
16659 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16660 return Folded;
16661 }
16662
16663 if (VT != MVT::i32)
16664 return SDValue();
16665
16666 SDLoc SL(N);
16667 SDValue LHS = N->getOperand(0);
16668 SDValue RHS = N->getOperand(1);
16669
16670 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16671 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16672 unsigned Opc = RHS.getOpcode();
16673 switch (Opc) {
16674 default:
16675 break;
16676 case ISD::ZERO_EXTEND:
16677 case ISD::SIGN_EXTEND:
16678 case ISD::ANY_EXTEND: {
16679 auto Cond = RHS.getOperand(0);
16680 // If this won't be a real VOPC output, we would still need to insert an
16681 // extra instruction anyway.
16682 if (!isBoolSGPR(Cond))
16683 break;
16684 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16685 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16686 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16687 return DAG.getNode(Opc, SL, VTList, Args);
16688 }
16689 }
16690
16691 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16692 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16693 if (!isNullConstant(LHS.getOperand(1)))
16694 return SDValue();
16695 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16696 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16697 }
16698 return SDValue();
16699}
16700
16701SDValue
16702SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16703 DAGCombinerInfo &DCI) const {
16704
16705 if (N->getValueType(0) != MVT::i32)
16706 return SDValue();
16707
16708 if (!isNullConstant(N->getOperand(1)))
16709 return SDValue();
16710
16711 SelectionDAG &DAG = DCI.DAG;
16712 SDValue LHS = N->getOperand(0);
16713
16714 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16715 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16716 unsigned LHSOpc = LHS.getOpcode();
16717 unsigned Opc = N->getOpcode();
16718 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16719 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16720 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16721 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16722 }
16723 return SDValue();
16724}
16725
16726SDValue SITargetLowering::performFAddCombine(SDNode *N,
16727 DAGCombinerInfo &DCI) const {
16728 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16729 return SDValue();
16730
16731 SelectionDAG &DAG = DCI.DAG;
16732 EVT VT = N->getValueType(0);
16733
16734 SDLoc SL(N);
16735 SDValue LHS = N->getOperand(0);
16736 SDValue RHS = N->getOperand(1);
16737
16738 // These should really be instruction patterns, but writing patterns with
16739 // source modifiers is a pain.
16740
16741 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16742 if (LHS.getOpcode() == ISD::FADD) {
16743 SDValue A = LHS.getOperand(0);
16744 if (A == LHS.getOperand(1)) {
16745 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16746 if (FusedOp != 0) {
16747 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16748 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16749 }
16750 }
16751 }
16752
16753 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16754 if (RHS.getOpcode() == ISD::FADD) {
16755 SDValue A = RHS.getOperand(0);
16756 if (A == RHS.getOperand(1)) {
16757 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16758 if (FusedOp != 0) {
16759 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16760 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16761 }
16762 }
16763 }
16764
16765 return SDValue();
16766}
16767
16768SDValue SITargetLowering::performFSubCombine(SDNode *N,
16769 DAGCombinerInfo &DCI) const {
16770 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16771 return SDValue();
16772
16773 SelectionDAG &DAG = DCI.DAG;
16774 SDLoc SL(N);
16775 EVT VT = N->getValueType(0);
16776 assert(!VT.isVector());
16777
16778 // Try to get the fneg to fold into the source modifier. This undoes generic
16779 // DAG combines and folds them into the mad.
16780 //
16781 // Only do this if we are not trying to support denormals. v_mad_f32 does
16782 // not support denormals ever.
16783 SDValue LHS = N->getOperand(0);
16784 SDValue RHS = N->getOperand(1);
16785 if (LHS.getOpcode() == ISD::FADD) {
16786 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16787 SDValue A = LHS.getOperand(0);
16788 if (A == LHS.getOperand(1)) {
16789 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16790 if (FusedOp != 0) {
16791 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16792 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16793
16794 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16795 }
16796 }
16797 }
16798
16799 if (RHS.getOpcode() == ISD::FADD) {
16800 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16801
16802 SDValue A = RHS.getOperand(0);
16803 if (A == RHS.getOperand(1)) {
16804 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16805 if (FusedOp != 0) {
16806 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16807 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16808 }
16809 }
16810 }
16811
16812 return SDValue();
16813}
16814
16815SDValue SITargetLowering::performFDivCombine(SDNode *N,
16816 DAGCombinerInfo &DCI) const {
16817 SelectionDAG &DAG = DCI.DAG;
16818 SDLoc SL(N);
16819 EVT VT = N->getValueType(0);
16820
16821 // fsqrt legality correlates to rsq availability.
16822 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
16823 return SDValue();
16824
16825 SDValue LHS = N->getOperand(0);
16826 SDValue RHS = N->getOperand(1);
16827
16828 SDNodeFlags Flags = N->getFlags();
16829 SDNodeFlags RHSFlags = RHS->getFlags();
16830 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16831 !RHS->hasOneUse())
16832 return SDValue();
16833
16834 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16835 bool IsNegative = false;
16836 if (CLHS->isExactlyValue(1.0) ||
16837 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16838 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16839 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16840 if (RHS.getOpcode() == ISD::FSQRT) {
16841 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16842 SDValue Rsq =
16843 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16844 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16845 }
16846 }
16847 }
16848
16849 return SDValue();
16850}
16851
16852SDValue SITargetLowering::performFMulCombine(SDNode *N,
16853 DAGCombinerInfo &DCI) const {
16854 SelectionDAG &DAG = DCI.DAG;
16855 EVT VT = N->getValueType(0);
16856 EVT ScalarVT = VT.getScalarType();
16857 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
16858
16859 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16860 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16861 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16862 return SDValue();
16863 }
16864
16865 SDValue LHS = N->getOperand(0);
16866 SDValue RHS = N->getOperand(1);
16867
16868 // It is cheaper to realize i32 inline constants than to materialize
16869 // f16 or f64 (or even non-inline f32) values; this is possible via ldexp
16870 // usage, as shown below:
16871 //
16872 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16873 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16874 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
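// Hypothetical example: fmul x, (select y, 8.0, 0.5)
//   -> fldexp(x, (select i32 y, 3, -1))   since 8.0 = 2^3 and 0.5 = 2^-1.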
16875 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16876 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16877 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16878 if (!TrueNode)
16879 return SDValue();
16880 const ConstantFPSDNode *FalseNode =
16881 isConstOrConstSplatFP(RHS.getOperand(2));
16882 if (!FalseNode)
16883 return SDValue();
16884
16885 if (TrueNode->isNegative() != FalseNode->isNegative())
16886 return SDValue();
16887
16888 // For f32, only non-inline constants should be transformed.
16889 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16890 if (ScalarVT == MVT::f32 &&
16891 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16892 TII->isInlineConstant(FalseNode->getValueAPF()))
16893 return SDValue();
16894
16895 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16896 if (TrueNodeExpVal == INT_MIN)
16897 return SDValue();
16898 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16899 if (FalseNodeExpVal == INT_MIN)
16900 return SDValue();
16901
16902 SDLoc SL(N);
16903 SDValue SelectNode =
16904 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16905 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16906 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16907
16908 LHS = TrueNode->isNegative()
16909 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16910 : LHS;
16911
16912 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16913 }
16914
16915 return SDValue();
16916}
16917
16918SDValue SITargetLowering::performFMACombine(SDNode *N,
16919 DAGCombinerInfo &DCI) const {
16920 SelectionDAG &DAG = DCI.DAG;
16921 EVT VT = N->getValueType(0);
16922 SDLoc SL(N);
16923
16924 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16925 return SDValue();
16926
16927 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16928 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16929 SDValue Op1 = N->getOperand(0);
16930 SDValue Op2 = N->getOperand(1);
16931 SDValue FMA = N->getOperand(2);
16932
16933 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16934 Op2.getOpcode() != ISD::FP_EXTEND)
16935 return SDValue();
16936
16937 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16938 // regardless of the denorm mode setting. Therefore,
16939 // fp-contract is sufficient to allow generating fdot2.
16940 const TargetOptions &Options = DAG.getTarget().Options;
16941 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16942 (N->getFlags().hasAllowContract() &&
16943 FMA->getFlags().hasAllowContract())) {
16944 Op1 = Op1.getOperand(0);
16945 Op2 = Op2.getOperand(0);
16946 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16947 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16948 return SDValue();
16949
16950 SDValue Vec1 = Op1.getOperand(0);
16951 SDValue Idx1 = Op1.getOperand(1);
16952 SDValue Vec2 = Op2.getOperand(0);
16953
16954 SDValue FMAOp1 = FMA.getOperand(0);
16955 SDValue FMAOp2 = FMA.getOperand(1);
16956 SDValue FMAAcc = FMA.getOperand(2);
16957
16958 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16959 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16960 return SDValue();
16961
16962 FMAOp1 = FMAOp1.getOperand(0);
16963 FMAOp2 = FMAOp2.getOperand(0);
16964 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16965 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16966 return SDValue();
16967
16968 SDValue Vec3 = FMAOp1.getOperand(0);
16969 SDValue Vec4 = FMAOp2.getOperand(0);
16970 SDValue Idx2 = FMAOp1.getOperand(1);
16971
16972 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16973 // Idx1 and Idx2 cannot be the same.
16974 Idx1 == Idx2)
16975 return SDValue();
16976
16977 if (Vec1 == Vec2 || Vec3 == Vec4)
16978 return SDValue();
16979
16980 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16981 return SDValue();
16982
16983 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16984 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16985 DAG.getTargetConstant(0, SL, MVT::i1));
16986 }
16987 }
16988 return SDValue();
16989}
16990
16991SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16992 DAGCombinerInfo &DCI) const {
16993 SelectionDAG &DAG = DCI.DAG;
16994 SDLoc SL(N);
16995
16996 SDValue LHS = N->getOperand(0);
16997 SDValue RHS = N->getOperand(1);
16998 EVT VT = LHS.getValueType();
16999 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17000
17001 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
17002 if (!CRHS) {
17003 CRHS = dyn_cast<ConstantSDNode>(LHS);
17004 if (CRHS) {
17005 std::swap(LHS, RHS);
17006 CC = getSetCCSwappedOperands(CC);
17007 }
17008 }
17009
17010 if (CRHS) {
17011 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
17012 isBoolSGPR(LHS.getOperand(0))) {
17013 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
17014 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
17015 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
17016 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
17017 if ((CRHS->isAllOnes() &&
17018 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
17019 (CRHS->isZero() &&
17020 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
17021 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
17022 DAG.getAllOnesConstant(SL, MVT::i1));
17023 if ((CRHS->isAllOnes() &&
17024 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
17025 (CRHS->isZero() &&
17026 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
17027 return LHS.getOperand(0);
17028 }
17029
17030 const APInt &CRHSVal = CRHS->getAPIntValue();
17031 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
17032 LHS.getOpcode() == ISD::SELECT &&
17033 isa<ConstantSDNode>(LHS.getOperand(1)) &&
17034 isa<ConstantSDNode>(LHS.getOperand(2)) &&
17035 isBoolSGPR(LHS.getOperand(0))) {
17036 // Given CT != FT:
17037 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
17038 // setcc (select cc, CT, CF), CF, ne => cc
17039 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
17040 // setcc (select cc, CT, CF), CT, eq => cc
17041 const APInt &CT = LHS.getConstantOperandAPInt(1);
17042 const APInt &CF = LHS.getConstantOperandAPInt(2);
17043
17044 if (CT != CF) {
17045 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
17046 (CT == CRHSVal && CC == ISD::SETNE))
17047 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
17048 if ((CF == CRHSVal && CC == ISD::SETNE) ||
17049 (CT == CRHSVal && CC == ISD::SETEQ))
17050 return LHS.getOperand(0);
17051 }
17052 }
17053 }
17054
17055 // Eliminate setcc by using carryout from add/sub instruction
17056
17057 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
17058 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
17059 // similarly for subtraction
17060
17061 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
17062 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
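// In other words (illustrative): the compare result is exactly the
// carry/borrow-out of the 64-bit add/sub, so the add/sub is rewritten as a
// 32-bit UADDO/USUBO + UADDO_CARRY/USUBO_CARRY pair and the setcc is replaced
// by the carry-out of the high half.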
17063
17064 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
17065 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
17066 (CC == ISD::SETUGT &&
17067 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
17068 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
17069 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
17070 bool IsAdd = LHS.getOpcode() == ISD::ADD;
17071
17072 SDValue Op0 = LHS.getOperand(0);
17073 SDValue Op1 = LHS.getOperand(1);
17074
17075 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
17076 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
17077
17078 SDValue Op0Hi = getHiHalf64(Op0, DAG);
17079 SDValue Op1Hi = getHiHalf64(Op1, DAG);
17080
17081 SDValue NodeLo =
17082 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
17083 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17084
17085 SDValue CarryInHi = NodeLo.getValue(1);
17086 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
17087 SL, DAG.getVTList(MVT::i32, MVT::i1),
17088 {Op0Hi, Op1Hi, CarryInHi});
17089
17090 SDValue ResultLo = NodeLo.getValue(0);
17091 SDValue ResultHi = NodeHi.getValue(0);
17092
17093 SDValue JoinedResult =
17094 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
17095
17096 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
17097 SDValue Overflow = NodeHi.getValue(1);
17098 DCI.CombineTo(LHS.getNode(), Result);
17099 return Overflow;
17100 }
17101
17102 if (VT != MVT::f32 && VT != MVT::f64 &&
17103 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17104 return SDValue();
17105
17106 // Match isinf/isfinite pattern
17107 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
17108 // (fcmp one (fabs x), inf) -> (fp_class x,
17109 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
17110 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
17111 LHS.getOpcode() == ISD::FABS) {
17112 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
17113 if (!CRHS)
17114 return SDValue();
17115
17116 const APFloat &APF = CRHS->getValueAPF();
17117 if (APF.isInfinity() && !APF.isNegative()) {
17118 const unsigned IsInfMask =
17119 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
17120 const unsigned IsFiniteMask =
17121 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
17122 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
17123 SIInstrFlags::P_SUBNORMAL;
17124 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
17125 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
17126 DAG.getConstant(Mask, SL, MVT::i32));
17127 }
17128 }
17129
17130 return SDValue();
17131}
17132
17133SDValue
17134SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
17135 DAGCombinerInfo &DCI) const {
17136 SelectionDAG &DAG = DCI.DAG;
17137 SDLoc SL(N);
17138 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17139
17140 SDValue Src = N->getOperand(0);
17141 SDValue Shift = N->getOperand(0);
17142
17143 // TODO: Extend type shouldn't matter (assuming legal types).
17144 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
17145 Shift = Shift.getOperand(0);
17146
17147 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
17148 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
17149 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
17150 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
17151 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
17152 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
17153 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
17154 SDValue Shifted = DAG.getZExtOrTrunc(
17155 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
17156
17157 unsigned ShiftOffset = 8 * Offset;
17158 if (Shift.getOpcode() == ISD::SHL)
17159 ShiftOffset -= C->getZExtValue();
17160 else
17161 ShiftOffset += C->getZExtValue();
17162
17163 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17164 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
17165 MVT::f32, Shifted);
17166 }
17167 }
17168 }
17169
17170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17171 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
17172 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
17173 // We simplified Src. If this node is not dead, visit it again so it is
17174 // folded properly.
17175 if (N->getOpcode() != ISD::DELETED_NODE)
17176 DCI.AddToWorklist(N);
17177 return SDValue(N, 0);
17178 }
17179
17180 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
17181 if (SDValue DemandedSrc =
17182 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
17183 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
17184
17185 return SDValue();
17186}
17187
17188SDValue SITargetLowering::performClampCombine(SDNode *N,
17189 DAGCombinerInfo &DCI) const {
17190 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
17191 if (!CSrc)
17192 return SDValue();
17193
17194 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17195 const APFloat &F = CSrc->getValueAPF();
17196 APFloat Zero = APFloat::getZero(F.getSemantics());
17197 if (F < Zero ||
17198 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17199 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
17200 }
17201
17202 APFloat One(F.getSemantics(), "1.0");
17203 if (F > One)
17204 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
17205
17206 return SDValue(CSrc, 0);
17207}
17208
17209SDValue SITargetLowering::performSelectCombine(SDNode *N,
17210 DAGCombinerInfo &DCI) const {
17211
17212 // Try to fold CMP + SELECT patterns with shared constants (both FP and
17213 // integer).
17214 // Detect when CMP and SELECT use the same constant and fold them to avoid
17215 // loading the constant twice. Specifically handles patterns like:
17216 // %cmp = icmp eq i32 %val, 4242
17217 // %sel = select i1 %cmp, i32 4242, i32 %other
17218 // This can be optimized to reuse %val instead of 4242 in the select.
17219 SDValue Cond = N->getOperand(0);
17220 SDValue TrueVal = N->getOperand(1);
17221 SDValue FalseVal = N->getOperand(2);
17222
17223 // Check if condition is a comparison.
17224 if (Cond.getOpcode() != ISD::SETCC)
17225 return SDValue();
17226
17227 SDValue LHS = Cond.getOperand(0);
17228 SDValue RHS = Cond.getOperand(1);
17229 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
17230
17231 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17232 bool isInteger = LHS.getValueType().isInteger();
17233
17234 // Handle simple floating-point and integer types only.
17235 if (!isFloatingPoint && !isInteger)
17236 return SDValue();
17237
17238 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17239 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17240 if (!isEquality && !isNonEquality)
17241 return SDValue();
17242
17243 SDValue ArgVal, ConstVal;
17244 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
17245 (isInteger && isa<ConstantSDNode>(RHS))) {
17246 ConstVal = RHS;
17247 ArgVal = LHS;
17248 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
17249 (isInteger && isa<ConstantSDNode>(LHS))) {
17250 ConstVal = LHS;
17251 ArgVal = RHS;
17252 } else {
17253 return SDValue();
17254 }
17255
17256 // Skip optimization for inlinable immediates.
17257 if (isFloatingPoint) {
17258 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
17259 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17260 return SDValue();
17261 } else {
17262 if (AMDGPU::isInlinableIntLiteral(
17263 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
17264 return SDValue();
17265 }
17266
17267 // For equality and non-equality comparisons, patterns:
17268 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17269 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17270 if (!(isEquality && TrueVal == ConstVal) &&
17271 !(isNonEquality && FalseVal == ConstVal))
17272 return SDValue();
17273
17274 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17275 SDValue SelectRHS =
17276 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17277 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
17278 SelectLHS, SelectRHS);
17279}
17280
17281 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
17282 DAGCombinerInfo &DCI) const {
17283 switch (N->getOpcode()) {
17284 case ISD::ADD:
17285 case ISD::SUB:
17286 case ISD::SHL:
17287 case ISD::SRL:
17288 case ISD::SRA:
17289 case ISD::AND:
17290 case ISD::OR:
17291 case ISD::XOR:
17292 case ISD::MUL:
17293 case ISD::SETCC:
17294 case ISD::SELECT:
17295 case ISD::SMIN:
17296 case ISD::SMAX:
17297 case ISD::UMIN:
17298 case ISD::UMAX:
17299 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17300 return Res;
17301 break;
17302 default:
17303 break;
17304 }
17305
17306 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17307 return SDValue();
17308
17309 switch (N->getOpcode()) {
17310 case ISD::ADD:
17311 return performAddCombine(N, DCI);
17312 case ISD::PTRADD:
17313 return performPtrAddCombine(N, DCI);
17314 case ISD::SUB:
17315 return performSubCombine(N, DCI);
17316 case ISD::UADDO_CARRY:
17317 case ISD::USUBO_CARRY:
17318 return performAddCarrySubCarryCombine(N, DCI);
17319 case ISD::FADD:
17320 return performFAddCombine(N, DCI);
17321 case ISD::FSUB:
17322 return performFSubCombine(N, DCI);
17323 case ISD::FDIV:
17324 return performFDivCombine(N, DCI);
17325 case ISD::FMUL:
17326 return performFMulCombine(N, DCI);
17327 case ISD::SETCC:
17328 return performSetCCCombine(N, DCI);
17329 case ISD::SELECT:
17330 if (auto Res = performSelectCombine(N, DCI))
17331 return Res;
17332 break;
17333 case ISD::FMAXNUM:
17334 case ISD::FMINNUM:
17335 case ISD::FMAXNUM_IEEE:
17336 case ISD::FMINNUM_IEEE:
17337 case ISD::FMAXIMUM:
17338 case ISD::FMINIMUM:
17339 case ISD::FMAXIMUMNUM:
17340 case ISD::FMINIMUMNUM:
17341 case ISD::SMAX:
17342 case ISD::SMIN:
17343 case ISD::UMAX:
17344 case ISD::UMIN:
17345 case AMDGPUISD::FMIN_LEGACY:
17346 case AMDGPUISD::FMAX_LEGACY:
17347 return performMinMaxCombine(N, DCI);
17348 case ISD::FMA:
17349 return performFMACombine(N, DCI);
17350 case ISD::AND:
17351 return performAndCombine(N, DCI);
17352 case ISD::OR:
17353 return performOrCombine(N, DCI);
17354 case ISD::FSHR: {
17355 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17356 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17357 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17358 return matchPERM(N, DCI);
17359 }
17360 break;
17361 }
17362 case ISD::XOR:
17363 return performXorCombine(N, DCI);
17364 case ISD::ANY_EXTEND:
17365 case ISD::ZERO_EXTEND:
17366 return performZeroOrAnyExtendCombine(N, DCI);
17367 case ISD::SIGN_EXTEND_INREG:
17368 return performSignExtendInRegCombine(N, DCI);
17369 case AMDGPUISD::FP_CLASS:
17370 return performClassCombine(N, DCI);
17371 case ISD::FCANONICALIZE:
17372 return performFCanonicalizeCombine(N, DCI);
17373 case AMDGPUISD::RCP:
17374 return performRcpCombine(N, DCI);
17375 case ISD::FLDEXP:
17376 case AMDGPUISD::FRACT:
17377 case AMDGPUISD::RSQ:
17378 case AMDGPUISD::RCP_LEGACY:
17379 case AMDGPUISD::RCP_IFLAG:
17380 case AMDGPUISD::RSQ_CLAMP: {
17381 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17382 SDValue Src = N->getOperand(0);
17383 if (Src.isUndef())
17384 return Src;
17385 break;
17386 }
17387 case ISD::SINT_TO_FP:
17388 case ISD::UINT_TO_FP:
17389 return performUCharToFloatCombine(N, DCI);
17390 case ISD::FCOPYSIGN:
17391 return performFCopySignCombine(N, DCI);
17392 case AMDGPUISD::CVT_F32_UBYTE0:
17393 case AMDGPUISD::CVT_F32_UBYTE1:
17394 case AMDGPUISD::CVT_F32_UBYTE2:
17395 case AMDGPUISD::CVT_F32_UBYTE3:
17396 return performCvtF32UByteNCombine(N, DCI);
17397 case AMDGPUISD::FMED3:
17398 return performFMed3Combine(N, DCI);
17399 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17400 return performCvtPkRTZCombine(N, DCI);
17401 case AMDGPUISD::CLAMP:
17402 return performClampCombine(N, DCI);
17403 case ISD::SCALAR_TO_VECTOR: {
17404 SelectionDAG &DAG = DCI.DAG;
17405 EVT VT = N->getValueType(0);
17406
17407 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17408 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17409 SDLoc SL(N);
17410 SDValue Src = N->getOperand(0);
17411 EVT EltVT = Src.getValueType();
17412 if (EltVT != MVT::i16)
17413 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
17414
17415 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
17416 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
17417 }
17418
17419 break;
17420 }
17421 case ISD::EXTRACT_VECTOR_ELT:
17422 return performExtractVectorEltCombine(N, DCI);
17423 case ISD::INSERT_VECTOR_ELT:
17424 return performInsertVectorEltCombine(N, DCI);
17425 case ISD::FP_ROUND:
17426 return performFPRoundCombine(N, DCI);
17427 case ISD::LOAD: {
17428 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
17429 return Widened;
17430 [[fallthrough]];
17431 }
17432 default: {
17433 if (!DCI.isBeforeLegalize()) {
17434 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
17435 return performMemSDNodeCombine(MemNode, DCI);
17436 }
17437
17438 break;
17439 }
17440 }
17441
17442 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17443}
17444
17445/// Helper function for adjustWritemask
17446static unsigned SubIdx2Lane(unsigned Idx) {
17447 switch (Idx) {
17448 default:
17449 return ~0u;
17450 case AMDGPU::sub0:
17451 return 0;
17452 case AMDGPU::sub1:
17453 return 1;
17454 case AMDGPU::sub2:
17455 return 2;
17456 case AMDGPU::sub3:
17457 return 3;
17458 case AMDGPU::sub4:
17459 return 4; // Possible with TFE/LWE
17460 }
17461}
17462
17463/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17464SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17465 SelectionDAG &DAG) const {
17466 unsigned Opcode = Node->getMachineOpcode();
17467
17468 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17469 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17470 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17471 return Node; // not implemented for D16
17472
17473 SDNode *Users[5] = {nullptr};
17474 unsigned Lane = 0;
17475 unsigned DmaskIdx =
17476 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17477 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17478 unsigned NewDmask = 0;
17479 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17480 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17481 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17482 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17483 unsigned TFCLane = 0;
17484 bool HasChain = Node->getNumValues() > 1;
17485
17486 if (OldDmask == 0) {
17487 // These are folded out, but on the chance it happens, don't assert.
17488 return Node;
17489 }
17490
17491 unsigned OldBitsSet = llvm::popcount(OldDmask);
17492 // Work out which is the TFE/LWE lane if that is enabled.
17493 if (UsesTFC) {
17494 TFCLane = OldBitsSet;
17495 }
17496
17497 // Try to figure out the used register components
17498 for (SDUse &Use : Node->uses()) {
17499
17500 // Don't look at users of the chain.
17501 if (Use.getResNo() != 0)
17502 continue;
17503
17504 SDNode *User = Use.getUser();
17505
17506 // Abort if we can't understand the usage
17507 if (!User->isMachineOpcode() ||
17508 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17509 return Node;
17510
17511 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17512 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17513 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17514 // set, etc.
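// Illustrative example (hypothetical dmask): with OldDmask = 0b1010 only the
// Y and W components are returned; they are packed into consecutive
// registers, so Lane 0 corresponds to Y and Lane 1 to W.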
17515 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
17516 if (Lane == ~0u)
17517 return Node;
17518
17519 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17520 if (UsesTFC && Lane == TFCLane) {
17521 Users[Lane] = User;
17522 } else {
17523 // Set which texture component corresponds to the lane.
17524 unsigned Comp;
17525 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17526 Comp = llvm::countr_zero(Dmask);
17527 Dmask &= ~(1 << Comp);
17528 }
17529
17530 // Abort if we have more than one user per component.
17531 if (Users[Lane])
17532 return Node;
17533
17534 Users[Lane] = User;
17535 NewDmask |= 1 << Comp;
17536 }
17537 }
17538
17539 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17540 bool NoChannels = !NewDmask;
17541 if (NoChannels) {
17542 if (!UsesTFC) {
17543 // No uses of the result and not using TFC. Then do nothing.
17544 return Node;
17545 }
17546 // If the original dmask has one channel - then nothing to do
17547 if (OldBitsSet == 1)
17548 return Node;
17549 // Use an arbitrary dmask - required for the instruction to work
17550 NewDmask = 1;
17551 }
17552 // Abort if there's no change
17553 if (NewDmask == OldDmask)
17554 return Node;
17555
17556 unsigned BitsSet = llvm::popcount(NewDmask);
17557
17558 // Check for TFE or LWE - increase the number of channels by one to account
17559 // for the extra return value
17560 // This will need adjustment for D16 if this is also included in
17561 // adjustWriteMask (this function), but at present D16 is excluded.
17562 unsigned NewChannels = BitsSet + UsesTFC;
17563
17564 int NewOpcode =
17565 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17566 assert(NewOpcode != -1 &&
17567 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17568 "failed to find equivalent MIMG op");
17569
17570 // Adjust the writemask in the node
17571 SmallVector<SDValue, 12> Ops;
17572 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17573 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17574 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17575
17576 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17577
17578 MVT ResultVT = NewChannels == 1
17579 ? SVT
17580 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17581 : NewChannels == 5 ? 8
17582 : NewChannels);
17583 SDVTList NewVTList =
17584 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17585
17586 MachineSDNode *NewNode =
17587 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17588
17589 if (HasChain) {
17590 // Update chain.
17591 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17592 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17593 }
17594
17595 if (NewChannels == 1) {
17596 assert(Node->hasNUsesOfValue(1, 0));
17597 SDNode *Copy =
17598 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17599 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17600 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17601 return nullptr;
17602 }
17603
17604 // Update the users of the node with the new indices
17605 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17606 SDNode *User = Users[i];
17607 if (!User) {
17608 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17609 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17610 if (i || !NoChannels)
17611 continue;
17612 } else {
17613 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17614 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17615 if (NewUser != User) {
17616 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17617 DAG.RemoveDeadNode(User);
17618 }
17619 }
17620
17621 switch (Idx) {
17622 default:
17623 break;
17624 case AMDGPU::sub0:
17625 Idx = AMDGPU::sub1;
17626 break;
17627 case AMDGPU::sub1:
17628 Idx = AMDGPU::sub2;
17629 break;
17630 case AMDGPU::sub2:
17631 Idx = AMDGPU::sub3;
17632 break;
17633 case AMDGPU::sub3:
17634 Idx = AMDGPU::sub4;
17635 break;
17636 }
17637 }
17638
17639 DAG.RemoveDeadNode(Node);
17640 return nullptr;
17641}
17642
17643 static bool isFrameIndexOp(SDValue Op) {
17644 if (Op.getOpcode() == ISD::AssertZext)
17645 Op = Op.getOperand(0);
17646
17647 return isa<FrameIndexSDNode>(Op);
17648}
17649
17650/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17651/// with frame index operands.
17652 /// LLVM assumes that the inputs to these instructions are registers.
17653SDNode *
17654 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17655 SelectionDAG &DAG) const {
17656 if (Node->getOpcode() == ISD::CopyToReg) {
17657 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17658 SDValue SrcVal = Node->getOperand(2);
17659
17660 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17661 // to try understanding copies to physical registers.
17662 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17663 SDLoc SL(Node);
17664 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17665 SDValue VReg = DAG.getRegister(
17666 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17667
17668 SDNode *Glued = Node->getGluedNode();
17669 SDValue ToVReg = DAG.getCopyToReg(
17670 Node->getOperand(0), SL, VReg, SrcVal,
17671 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17672 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17673 VReg, ToVReg.getValue(1));
17674 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17675 DAG.RemoveDeadNode(Node);
17676 return ToResultReg.getNode();
17677 }
17678 }
17679
17680 SmallVector<SDValue, 8> Ops;
17681 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17682 if (!isFrameIndexOp(Node->getOperand(i))) {
17683 Ops.push_back(Node->getOperand(i));
17684 continue;
17685 }
17686
17687 SDLoc DL(Node);
17688 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17689 Node->getOperand(i).getValueType(),
17690 Node->getOperand(i)),
17691 0));
17692 }
17693
17694 return DAG.UpdateNodeOperands(Node, Ops);
17695}
17696
17697/// Fold the instructions after selecting them.
17698/// Returns null if users were already updated.
17699 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17700 SelectionDAG &DAG) const {
17701 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17702 unsigned Opcode = Node->getMachineOpcode();
17703
17704 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17705 !TII->isGather4(Opcode) &&
17706 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17707 return adjustWritemask(Node, DAG);
17708 }
17709
17710 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17711 legalizeTargetIndependentNode(Node, DAG);
17712 return Node;
17713 }
17714
17715 switch (Opcode) {
17716 case AMDGPU::V_DIV_SCALE_F32_e64:
17717 case AMDGPU::V_DIV_SCALE_F64_e64: {
17718 // Satisfy the operand register constraint when one of the inputs is
17719 // undefined. Ordinarily each undef value will have its own implicit_def of
17720 // a vreg, so force these to use a single register.
17721 SDValue Src0 = Node->getOperand(1);
17722 SDValue Src1 = Node->getOperand(3);
17723 SDValue Src2 = Node->getOperand(5);
17724
17725 if ((Src0.isMachineOpcode() &&
17726 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17727 (Src0 == Src1 || Src0 == Src2))
17728 break;
17729
17730 MVT VT = Src0.getValueType().getSimpleVT();
17731 const TargetRegisterClass *RC =
17732 getRegClassFor(VT, Src0.getNode()->isDivergent());
17733 
17734 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17735 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17736
17737 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17738 Src0, SDValue());
17739
17740 // src0 must be the same register as src1 or src2, even if the value is
17741 // undefined, so make sure we don't violate this constraint.
17742 if (Src0.isMachineOpcode() &&
17743 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17744 if (Src1.isMachineOpcode() &&
17745 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17746 Src0 = Src1;
17747 else if (Src2.isMachineOpcode() &&
17748 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17749 Src0 = Src2;
17750 else {
17751 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17752 Src0 = UndefReg;
17753 Src1 = UndefReg;
17754 }
17755 } else
17756 break;
17757
17759 Ops[1] = Src0;
17760 Ops[3] = Src1;
17761 Ops[5] = Src2;
17762 Ops.push_back(ImpDef.getValue(1));
17763 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17764 }
17765 default:
17766 break;
17767 }
17768
17769 return Node;
17770}
17771
17772// Any MIMG instruction that uses tfe or lwe requires an initialization of the
17773// result register that will be written in the case of a memory access failure.
17774// The required code is also added to tie this init code to the result of the
17775// image instruction.
17778 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17779 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17780 MachineBasicBlock &MBB = *MI.getParent();
17781
17782 int DstIdx =
17783 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17784 unsigned InitIdx = 0;
17785
17786 if (TII->isImage(MI)) {
17787 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17788 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17789 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17790
17791 if (!TFE && !LWE) // intersect_ray
17792 return;
17793
17794 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17795 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17796 unsigned D16Val = D16 ? D16->getImm() : 0;
17797
17798 if (!TFEVal && !LWEVal)
17799 return;
17800
17801 // At least one of TFE or LWE is non-zero
17802 // We have to insert a suitable initialization of the result value and
17803 // tie this to the dest of the image instruction.
17804
17805 // Calculate which dword we have to initialize to 0.
17806 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17807
17808 // Check that the dmask operand is present.
17809 assert(MO_Dmask && "Expected dmask operand in instruction");
17810
17811 unsigned dmask = MO_Dmask->getImm();
17812 // Determine the number of active lanes taking into account the
17813 // Gather4 special case
17814 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17815
17816 bool Packed = !Subtarget->hasUnpackedD16VMem();
17817
17818 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
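// For example, dmask = 0b1011 on a non-gather4 instruction gives 3 active
// lanes; with unpacked or no D16 data the TFE/LWE status dword makes
// InitIdx = 4, while with packed D16 data InitIdx = ((3 + 1) >> 1) + 1 = 3.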
17819
17820 // Abandon the attempt if the dst size isn't large enough.
17821 // This is in fact an error, but it is picked up elsewhere and
17822 // reported correctly.
17823 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17824
17825 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17826 if (DstSize < InitIdx)
17827 return;
17828 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17829 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
17830 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17831 } else {
17832 return;
17833 }
17834
17835 const DebugLoc &DL = MI.getDebugLoc();
17836
17837 // Create a register for the initialization value.
17838 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17839 unsigned NewDst = 0; // Final initialized value will be in here
17840
17841 // If the PRTStrictNull feature is enabled (the default), initialize
17842 // all the result registers to 0; otherwise initialize just the error
17843 // indication register (VGPRn+1).
17844 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17845 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17846
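// The loop below zero-initializes SizeLeft consecutive dwords of the result,
// starting at CurrIdx, building up the value with INSERT_SUBREG.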
17847 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17848 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17849 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17850 // Initialize dword
17851 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17852 // clang-format off
17853 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17854 .addImm(0);
17855 // clang-format on
17856 // Insert into the super-reg
17857 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17858 .addReg(PrevDst)
17859 .addReg(SubReg)
17861
17862 PrevDst = NewDst;
17863 }
17864
17865 // Add as an implicit operand
17866 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17867
17868 // Tie the just added implicit operand to the dst
17869 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17870}
17871
17872/// Assign the register class depending on the number of
17873/// bits set in the writemask
17875 SDNode *Node) const {
17877
17878 MachineFunction *MF = MI.getMF();
17880
17881 if (TII->isVOP3(MI.getOpcode())) {
17882 // Make sure constant bus requirements are respected.
17883 TII->legalizeOperandsVOP3(MRI, MI);
17884
17885 if (TII->isMAI(MI)) {
17886 // The ordinary src0, src1, src2 were legalized above.
17887 //
17888 // We also have to legalize the appended v_mfma_ld_scale_b32 operands,
17889 // treating them as a separate instruction.
17890 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17891 AMDGPU::OpName::scale_src0);
17892 if (Src0Idx != -1) {
17893 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17894 AMDGPU::OpName::scale_src1);
17895 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17896 TII->usesConstantBus(MRI, MI, Src1Idx))
17897 TII->legalizeOpWithMove(MI, Src1Idx);
17898 }
17899 }
17900
17901 return;
17902 }
17903
17904 if (TII->isImage(MI))
17905 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17906}
17907
17909 uint64_t Val) {
17910 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17911 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17912}
17913
17915 const SDLoc &DL,
17916 SDValue Ptr) const {
17918
17919 // Build the subregister half that holds the constants before building the
17920 // full 128-bit register. If we are building multiple resource descriptors,
17921 // this will allow CSEing of the 2-component register.
17922 const SDValue Ops0[] = {
17923 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17924 buildSMovImm32(DAG, DL, 0),
17925 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17926 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17927 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17928
17929 SDValue SubRegHi = SDValue(
17930 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17931
17932 // Combine the constants and the pointer.
17933 const SDValue Ops1[] = {
17934 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17935 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17936 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17937
17938 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17939}
17940
17941/// Return a resource descriptor with the 'Add TID' bit enabled
17942/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17943/// of the resource descriptor) to create an offset, which is added to
17944/// the resource pointer.
17946 SDValue Ptr, uint32_t RsrcDword1,
17947 uint64_t RsrcDword2And3) const {
17948 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17949 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17950 if (RsrcDword1) {
17951 PtrHi =
17952 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17953 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17954 0);
17955 }
17956
17957 SDValue DataLo =
17958 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17959 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17960
17961 const SDValue Ops[] = {
17962 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17963 PtrLo,
17964 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17965 PtrHi,
17966 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17967 DataLo,
17968 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17969 DataHi,
17970 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17971
17972 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17973}
17974
17975//===----------------------------------------------------------------------===//
17976// SI Inline Assembly Support
17977//===----------------------------------------------------------------------===//
17978
17979std::pair<unsigned, const TargetRegisterClass *>
17981 StringRef Constraint,
17982 MVT VT) const {
17983 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17984
17985 const TargetRegisterClass *RC = nullptr;
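// Single-letter constraints select a register file: 's'/'r' map to SGPR
// classes, 'v' to VGPR classes and 'a' to AGPR classes (when MAI instructions
// are available), e.g. asm volatile("v_mov_b32 %0, %1" : "=v"(dst) : "s"(src)).
// The two-letter "VA" constraint selects the combined AV superclass on
// gfx90a+.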
17986 if (Constraint.size() == 1) {
17987 // Check if we cannot determine the bit size of the given value type. This
17988 // can happen, for example, when we have an empty struct
17989 // (size 0): `call void asm "", "v"({} poison)`.
17990 if (VT == MVT::Other)
17991 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17992 const unsigned BitWidth = VT.getSizeInBits();
17993 switch (Constraint[0]) {
17994 default:
17995 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17996 case 's':
17997 case 'r':
17998 switch (BitWidth) {
17999 case 16:
18000 RC = &AMDGPU::SReg_32RegClass;
18001 break;
18002 case 64:
18003 RC = &AMDGPU::SGPR_64RegClass;
18004 break;
18005 default:
18007 if (!RC)
18008 return std::pair(0U, nullptr);
18009 break;
18010 }
18011 break;
18012 case 'v':
18013 switch (BitWidth) {
18014 case 1:
18015 return std::pair(0U, nullptr);
18016 case 16:
18017 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18018 : &AMDGPU::VGPR_32_Lo256RegClass;
18019 break;
18020 default:
18021 RC = Subtarget->has1024AddressableVGPRs()
18022 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18023 : TRI->getVGPRClassForBitWidth(BitWidth);
18024 if (!RC)
18025 return std::pair(0U, nullptr);
18026 break;
18027 }
18028 break;
18029 case 'a':
18030 if (!Subtarget->hasMAIInsts())
18031 break;
18032 switch (BitWidth) {
18033 case 1:
18034 return std::pair(0U, nullptr);
18035 case 16:
18036 RC = &AMDGPU::AGPR_32RegClass;
18037 break;
18038 default:
18039 RC = TRI->getAGPRClassForBitWidth(BitWidth);
18040 if (!RC)
18041 return std::pair(0U, nullptr);
18042 break;
18043 }
18044 break;
18045 }
18046 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
18047 const unsigned BitWidth = VT.getSizeInBits();
18048 switch (BitWidth) {
18049 case 16:
18050 RC = &AMDGPU::AV_32RegClass;
18051 break;
18052 default:
18053 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
18054 if (!RC)
18055 return std::pair(0U, nullptr);
18056 break;
18057 }
18058 }
18059
18060 // We actually support i128, i16 and f16 as inline parameters
18061 // even if they are not reported as legal
18062 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
18063 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
18064 return std::pair(0U, RC);
18065
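// Handle explicit physical register constraints such as "{v1}" or "{s[4:7]}";
// Kind is the register file letter, Idx the first register index and NumRegs
// the length of the range.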
18066 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
18067 if (Kind != '\0') {
18068 if (Kind == 'v') {
18069 RC = &AMDGPU::VGPR_32_Lo256RegClass;
18070 } else if (Kind == 's') {
18071 RC = &AMDGPU::SGPR_32RegClass;
18072 } else if (Kind == 'a') {
18073 RC = &AMDGPU::AGPR_32RegClass;
18074 }
18075
18076 if (RC) {
18077 if (NumRegs > 1) {
18078 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
18079 return std::pair(0U, nullptr);
18080
18081 uint32_t Width = NumRegs * 32;
18082 // Prohibit constraints for register ranges with a width that does not
18083 // match the required type.
18084 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
18085 return std::pair(0U, nullptr);
18086
18087 MCRegister Reg = RC->getRegister(Idx);
18089 RC = TRI->getVGPRClassForBitWidth(Width);
18090 else if (SIRegisterInfo::isSGPRClass(RC))
18091 RC = TRI->getSGPRClassForBitWidth(Width);
18092 else if (SIRegisterInfo::isAGPRClass(RC))
18093 RC = TRI->getAGPRClassForBitWidth(Width);
18094 if (RC) {
18095 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
18096 if (!Reg) {
18097 // The register class does not contain the requested register,
18098 // e.g., because it is an SGPR pair that would violate alignment
18099 // requirements.
18100 return std::pair(0U, nullptr);
18101 }
18102 return std::pair(Reg, RC);
18103 }
18104 }
18105
18106 // Check for lossy scalar/vector conversions.
18107 if (VT.isVector() && VT.getSizeInBits() != 32)
18108 return std::pair(0U, nullptr);
18109 if (Idx < RC->getNumRegs())
18110 return std::pair(RC->getRegister(Idx), RC);
18111 return std::pair(0U, nullptr);
18112 }
18113 }
18114
18115 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18116 if (Ret.first)
18117 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
18118
18119 return Ret;
18120}
18121
18122static bool isImmConstraint(StringRef Constraint) {
18123 if (Constraint.size() == 1) {
18124 switch (Constraint[0]) {
18125 default:
18126 break;
18127 case 'I':
18128 case 'J':
18129 case 'A':
18130 case 'B':
18131 case 'C':
18132 return true;
18133 }
18134 } else if (Constraint == "DA" || Constraint == "DB") {
18135 return true;
18136 }
18137 return false;
18138}
18139
18142 if (Constraint.size() == 1) {
18143 switch (Constraint[0]) {
18144 default:
18145 break;
18146 case 's':
18147 case 'v':
18148 case 'a':
18149 return C_RegisterClass;
18150 }
18151 } else if (Constraint.size() == 2) {
18152 if (Constraint == "VA")
18153 return C_RegisterClass;
18154 }
18155 if (isImmConstraint(Constraint)) {
18156 return C_Other;
18157 }
18158 return TargetLowering::getConstraintType(Constraint);
18159}
18160
18161static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
18163 Val = Val & maskTrailingOnes<uint64_t>(Size);
18164 }
18165 return Val;
18166}
18167
18169 StringRef Constraint,
18170 std::vector<SDValue> &Ops,
18171 SelectionDAG &DAG) const {
18172 if (isImmConstraint(Constraint)) {
18173 uint64_t Val;
18174 if (getAsmOperandConstVal(Op, Val) &&
18175 checkAsmConstraintVal(Op, Constraint, Val)) {
18176 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
18177 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
18178 }
18179 } else {
18181 }
18182}
18183
18185 unsigned Size = Op.getScalarValueSizeInBits();
18186 if (Size > 64)
18187 return false;
18188
18189 if (Size == 16 && !Subtarget->has16BitInsts())
18190 return false;
18191
18193 Val = C->getSExtValue();
18194 return true;
18195 }
18197 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18198 return true;
18199 }
18201 if (Size != 16 || Op.getNumOperands() != 2)
18202 return false;
18203 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
18204 return false;
18205 if (ConstantSDNode *C = V->getConstantSplatNode()) {
18206 Val = C->getSExtValue();
18207 return true;
18208 }
18209 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
18210 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18211 return true;
18212 }
18213 }
18214
18215 return false;
18216}
18217
18219 uint64_t Val) const {
18220 if (Constraint.size() == 1) {
18221 switch (Constraint[0]) {
18222 case 'I':
18224 case 'J':
18225 return isInt<16>(Val);
18226 case 'A':
18227 return checkAsmConstraintValA(Op, Val);
18228 case 'B':
18229 return isInt<32>(Val);
18230 case 'C':
18231 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
18233 default:
18234 break;
18235 }
18236 } else if (Constraint.size() == 2) {
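// "DA": each 32-bit half of the 64-bit value must be a valid inline constant
// on its own. "DB": any 64-bit value is accepted.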
18237 if (Constraint == "DA") {
18238 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18239 int64_t LoBits = static_cast<int32_t>(Val);
18240 return checkAsmConstraintValA(Op, HiBits, 32) &&
18241 checkAsmConstraintValA(Op, LoBits, 32);
18242 }
18243 if (Constraint == "DB") {
18244 return true;
18245 }
18246 }
18247 llvm_unreachable("Invalid asm constraint");
18248}
18249
18251 unsigned MaxSize) const {
18252 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18253 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18254 if (Size == 16) {
18255 MVT VT = Op.getSimpleValueType();
18256 switch (VT.SimpleTy) {
18257 default:
18258 return false;
18259 case MVT::i16:
18260 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
18261 case MVT::f16:
18262 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
18263 case MVT::bf16:
18264 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
18265 case MVT::v2i16:
18266 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
18267 case MVT::v2f16:
18268 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
18269 case MVT::v2bf16:
18270 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
18271 }
18272 }
18273 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
18274 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
18275 return true;
18276 return false;
18277}
18278
18279static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18280 switch (UnalignedClassID) {
18281 case AMDGPU::VReg_64RegClassID:
18282 return AMDGPU::VReg_64_Align2RegClassID;
18283 case AMDGPU::VReg_96RegClassID:
18284 return AMDGPU::VReg_96_Align2RegClassID;
18285 case AMDGPU::VReg_128RegClassID:
18286 return AMDGPU::VReg_128_Align2RegClassID;
18287 case AMDGPU::VReg_160RegClassID:
18288 return AMDGPU::VReg_160_Align2RegClassID;
18289 case AMDGPU::VReg_192RegClassID:
18290 return AMDGPU::VReg_192_Align2RegClassID;
18291 case AMDGPU::VReg_224RegClassID:
18292 return AMDGPU::VReg_224_Align2RegClassID;
18293 case AMDGPU::VReg_256RegClassID:
18294 return AMDGPU::VReg_256_Align2RegClassID;
18295 case AMDGPU::VReg_288RegClassID:
18296 return AMDGPU::VReg_288_Align2RegClassID;
18297 case AMDGPU::VReg_320RegClassID:
18298 return AMDGPU::VReg_320_Align2RegClassID;
18299 case AMDGPU::VReg_352RegClassID:
18300 return AMDGPU::VReg_352_Align2RegClassID;
18301 case AMDGPU::VReg_384RegClassID:
18302 return AMDGPU::VReg_384_Align2RegClassID;
18303 case AMDGPU::VReg_512RegClassID:
18304 return AMDGPU::VReg_512_Align2RegClassID;
18305 case AMDGPU::VReg_1024RegClassID:
18306 return AMDGPU::VReg_1024_Align2RegClassID;
18307 case AMDGPU::AReg_64RegClassID:
18308 return AMDGPU::AReg_64_Align2RegClassID;
18309 case AMDGPU::AReg_96RegClassID:
18310 return AMDGPU::AReg_96_Align2RegClassID;
18311 case AMDGPU::AReg_128RegClassID:
18312 return AMDGPU::AReg_128_Align2RegClassID;
18313 case AMDGPU::AReg_160RegClassID:
18314 return AMDGPU::AReg_160_Align2RegClassID;
18315 case AMDGPU::AReg_192RegClassID:
18316 return AMDGPU::AReg_192_Align2RegClassID;
18317 case AMDGPU::AReg_256RegClassID:
18318 return AMDGPU::AReg_256_Align2RegClassID;
18319 case AMDGPU::AReg_512RegClassID:
18320 return AMDGPU::AReg_512_Align2RegClassID;
18321 case AMDGPU::AReg_1024RegClassID:
18322 return AMDGPU::AReg_1024_Align2RegClassID;
18323 default:
18324 return -1;
18325 }
18326}
18327
18328// Figure out which registers should be reserved for stack access. Only after
18329// the function is legalized do we know all of the non-spill stack objects or if
18330// calls are present.
18334 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18335 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18336 const SIInstrInfo *TII = ST.getInstrInfo();
18337
18338 if (Info->isEntryFunction()) {
18339 // Callable functions have fixed registers used for stack access.
18341 }
18342
18343 // TODO: Move this logic to getReservedRegs()
18344 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18345 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18346 Register SReg = ST.isWave32()
18347 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18348 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18349 &AMDGPU::SGPR_64RegClass);
18350 Info->setSGPRForEXECCopy(SReg);
18351
18352 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18353 Info->getStackPtrOffsetReg()));
18354 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18355 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18356
18357 // We need to worry about replacing the default register with itself in case
18358 // of MIR testcases missing the MFI.
18359 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18360 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18361
18362 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18363 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18364
18365 Info->limitOccupancy(MF);
18366
18367 if (ST.isWave32() && !MF.empty()) {
18368 for (auto &MBB : MF) {
18369 for (auto &MI : MBB) {
18370 TII->fixImplicitOperands(MI);
18371 }
18372 }
18373 }
18374
18375 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18376 // classes if required. Ideally the register class constraints would differ
18377 // per-subtarget, but there's no easy way to achieve that right now. This is
18378 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18379 // from using them as the register class for legal types.
18380 if (ST.needsAlignedVGPRs()) {
18381 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18382 const Register Reg = Register::index2VirtReg(I);
18383 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18384 if (!RC)
18385 continue;
18386 int NewClassID = getAlignedAGPRClassID(RC->getID());
18387 if (NewClassID != -1)
18388 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18389 }
18390 }
18391
18393}
18394
18396 KnownBits &Known,
18397 const APInt &DemandedElts,
18398 const SelectionDAG &DAG,
18399 unsigned Depth) const {
18400 Known.resetAll();
18401 unsigned Opc = Op.getOpcode();
18402 switch (Opc) {
18404 unsigned IID = Op.getConstantOperandVal(0);
18405 switch (IID) {
18406 case Intrinsic::amdgcn_mbcnt_lo:
18407 case Intrinsic::amdgcn_mbcnt_hi: {
18408 const GCNSubtarget &ST =
18410 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18411 // most 31 + src1.
18412 Known.Zero.setBitsFrom(
18413 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18414 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
18415 Known = KnownBits::add(Known, Known2);
18416 return;
18417 }
18418 }
18419 break;
18420 }
18421 }
18423 Op, Known, DemandedElts, DAG, Depth);
18424}
18425
18427 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18429
18430 // Set the high bits to zero based on the maximum allowed scratch size per
18431 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18432 // calculation won't overflow, so assume the sign bit is never set.
18433 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18434}
18435
18437 GISelValueTracking &VT, KnownBits &Known,
18438 unsigned Dim) {
18439 unsigned MaxValue =
18440 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
18441 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
18442}
18443
18445 KnownBits &Known, const APInt &DemandedElts,
18446 unsigned BFEWidth, bool SExt, unsigned Depth) {
18448 const MachineOperand &Src1 = MI.getOperand(2);
18449
18450 unsigned Src1Cst = 0;
18451 if (Src1.isImm()) {
18452 Src1Cst = Src1.getImm();
18453 } else if (Src1.isReg()) {
18454 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
18455 if (!Cst)
18456 return;
18457 Src1Cst = Cst->Value.getZExtValue();
18458 } else {
18459 return;
18460 }
18461
18462 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18463 // Width is always [22:16].
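// For example, S_BFE_U32 with src1 = 0x00080008 extracts 8 bits starting at
// bit 8, i.e. bits [15:8] of src0.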
18464 const unsigned Offset =
18465 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
18466 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
18467
18468 if (Width >= BFEWidth) // Ill-formed.
18469 return;
18470
18471 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
18472 Depth + 1);
18473
18474 Known = Known.extractBits(Width, Offset);
18475
18476 if (SExt)
18477 Known = Known.sext(BFEWidth);
18478 else
18479 Known = Known.zext(BFEWidth);
18480}
18481
18483 GISelValueTracking &VT, Register R, KnownBits &Known,
18484 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18485 unsigned Depth) const {
18486 Known.resetAll();
18487 const MachineInstr *MI = MRI.getVRegDef(R);
18488 switch (MI->getOpcode()) {
18489 case AMDGPU::S_BFE_I32:
18490 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18491 /*SExt=*/true, Depth);
18492 case AMDGPU::S_BFE_U32:
18493 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18494 /*SExt=*/false, Depth);
18495 case AMDGPU::S_BFE_I64:
18496 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18497 /*SExt=*/true, Depth);
18498 case AMDGPU::S_BFE_U64:
18499 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18500 /*SExt=*/false, Depth);
18501 case AMDGPU::G_INTRINSIC:
18502 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18503 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18504 switch (IID) {
18505 case Intrinsic::amdgcn_workitem_id_x:
18506 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18507 break;
18508 case Intrinsic::amdgcn_workitem_id_y:
18509 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18510 break;
18511 case Intrinsic::amdgcn_workitem_id_z:
18512 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18513 break;
18514 case Intrinsic::amdgcn_mbcnt_lo:
18515 case Intrinsic::amdgcn_mbcnt_hi: {
18516 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18517 // most 31 + src1.
18518 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18519 ? getSubtarget()->getWavefrontSizeLog2()
18520 : 5);
18521 KnownBits Known2;
18522 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18523 Depth + 1);
18524 Known = KnownBits::add(Known, Known2);
18525 break;
18526 }
18527 case Intrinsic::amdgcn_groupstaticsize: {
18528 // We can report everything over the maximum size as 0. We can't report
18529 // based on the actual size because we don't know if it's accurate or not
18530 // at any given point.
18531 Known.Zero.setHighBits(
18532 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18533 break;
18534 }
18535 }
18536 break;
18537 }
18538 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18539 Known.Zero.setHighBits(24);
18540 break;
18541 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18542 Known.Zero.setHighBits(16);
18543 break;
18544 case AMDGPU::G_AMDGPU_SMED3:
18545 case AMDGPU::G_AMDGPU_UMED3: {
18546 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18547
18548 KnownBits Known2;
18549 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18550 if (Known2.isUnknown())
18551 break;
18552
18553 KnownBits Known1;
18554 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18555 if (Known1.isUnknown())
18556 break;
18557
18558 KnownBits Known0;
18559 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18560 if (Known0.isUnknown())
18561 break;
18562
18563 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18564 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18565 Known.One = Known0.One & Known1.One & Known2.One;
18566 break;
18567 }
18568 }
18569}
18570
18573 unsigned Depth) const {
18574 const MachineInstr *MI = MRI.getVRegDef(R);
18575 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18576 // FIXME: Can this move to generic code? What about the case where the call
18577 // site specifies a lower alignment?
18578 Intrinsic::ID IID = GI->getIntrinsicID();
18580 AttributeList Attrs =
18581 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18582 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18583 return *RetAlign;
18584 }
18585 return Align(1);
18586}
18587
18590 const Align CacheLineAlign = Align(64);
18591
18592 // Pre-GFX10 targets did not benefit from loop alignment
18593 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18594 getSubtarget()->hasInstFwdPrefetchBug())
18595 return PrefAlign;
18596
18597 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
18598 // By default the prefetcher keeps one cache line behind and reads two ahead.
18599 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18600 // behind and one ahead.
18601 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18602 // If the loop fits in 64 bytes it always spans no more than two cache lines and
18603 // does not need an alignment.
18604 // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch;
18605 // if it is at most 192 bytes we need two lines behind.
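// In terms of the code below: loops of at most 64 bytes keep the default
// alignment, loops of at most 128 bytes get 64-byte alignment, loops of at
// most 192 bytes additionally get S_INST_PREFETCH adjustments in the
// preheader and exit block (unless a parent loop already has them), and
// larger loops are left with the default alignment.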
18606
18608 const MachineBasicBlock *Header = ML->getHeader();
18609 if (Header->getAlignment() != PrefAlign)
18610 return Header->getAlignment(); // Already processed.
18611
18612 unsigned LoopSize = 0;
18613 for (const MachineBasicBlock *MBB : ML->blocks()) {
18614 // If an inner loop block is aligned, assume on average half of the
18615 // alignment size is added as nops.
18616 if (MBB != Header)
18617 LoopSize += MBB->getAlignment().value() / 2;
18618
18619 for (const MachineInstr &MI : *MBB) {
18620 LoopSize += TII->getInstSizeInBytes(MI);
18621 if (LoopSize > 192)
18622 return PrefAlign;
18623 }
18624 }
18625
18626 if (LoopSize <= 64)
18627 return PrefAlign;
18628
18629 if (LoopSize <= 128)
18630 return CacheLineAlign;
18631
18632 // If any of the parent loops is surrounded by prefetch instructions, do not
18633 // insert a new one for the inner loop, as it would reset the parent's settings.
18634 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18635 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18636 auto I = Exit->getFirstNonDebugInstr();
18637 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18638 return CacheLineAlign;
18639 }
18640 }
18641
18642 MachineBasicBlock *Pre = ML->getLoopPreheader();
18643 MachineBasicBlock *Exit = ML->getExitBlock();
18644
18645 if (Pre && Exit) {
18646 auto PreTerm = Pre->getFirstTerminator();
18647 if (PreTerm == Pre->begin() ||
18648 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18649 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18650 .addImm(1); // prefetch 2 lines behind PC
18651
18652 auto ExitHead = Exit->getFirstNonDebugInstr();
18653 if (ExitHead == Exit->end() ||
18654 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18655 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18656 .addImm(2); // prefetch 1 line behind PC
18657 }
18658
18659 return CacheLineAlign;
18660}
18661
18662[[maybe_unused]]
18663static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18664 assert(N->getOpcode() == ISD::CopyFromReg);
18665 do {
18666 // Follow the chain until we find an INLINEASM node.
18667 N = N->getOperand(0).getNode();
18668 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18669 return true;
18670 } while (N->getOpcode() == ISD::CopyFromReg);
18671 return false;
18672}
18673
18676 UniformityInfo *UA) const {
18677 switch (N->getOpcode()) {
18678 case ISD::CopyFromReg: {
18679 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18680 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18681 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18682 Register Reg = R->getReg();
18683
18684 // FIXME: Why does this need to consider isLiveIn?
18685 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18686 return !TRI->isSGPRReg(MRI, Reg);
18687
18688 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18689 return UA->isDivergent(V);
18690
18692 return !TRI->isSGPRReg(MRI, Reg);
18693 }
18694 case ISD::LOAD: {
18695 const LoadSDNode *L = cast<LoadSDNode>(N);
18696 unsigned AS = L->getAddressSpace();
18697 // A flat load may access private memory.
18699 }
18700 case ISD::CALLSEQ_END:
18701 return true;
18703 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18705 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18706 case AMDGPUISD::ATOMIC_CMP_SWAP:
18707 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18708 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18709 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18710 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18711 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18712 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18713 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18714 case AMDGPUISD::BUFFER_ATOMIC_AND:
18715 case AMDGPUISD::BUFFER_ATOMIC_OR:
18716 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18717 case AMDGPUISD::BUFFER_ATOMIC_INC:
18718 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18719 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18720 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18721 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18722 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18723 // Target-specific read-modify-write atomics are sources of divergence.
18724 return true;
18725 default:
18726 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18727 // Generic read-modify-write atomics are sources of divergence.
18728 return A->readMem() && A->writeMem();
18729 }
18730 return false;
18731 }
18732}
18733
18735 EVT VT) const {
18736 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18737 case MVT::f32:
18739 case MVT::f64:
18740 case MVT::f16:
18742 default:
18743 return false;
18744 }
18745}
18746
18748 LLT Ty, const MachineFunction &MF) const {
18749 switch (Ty.getScalarSizeInBits()) {
18750 case 32:
18751 return !denormalModeIsFlushAllF32(MF);
18752 case 64:
18753 case 16:
18754 return !denormalModeIsFlushAllF64F16(MF);
18755 default:
18756 return false;
18757 }
18758}
18759
18761 const APInt &DemandedElts,
18762 const SelectionDAG &DAG,
18763 bool SNaN,
18764 unsigned Depth) const {
18765 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18766 const MachineFunction &MF = DAG.getMachineFunction();
18768
18769 if (Info->getMode().DX10Clamp)
18770 return true; // Clamped to 0.
18771 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18772 }
18773
18775 DAG, SNaN, Depth);
18776}
18777
18778// On older subtargets, global FP atomic instructions have a hardcoded FP mode;
18779// they do not support FP32 denormals and only support v2f16/f64 denormals.
18781 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18782 return true;
18783
18784 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18785 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18786 if (DenormMode == DenormalMode::getPreserveSign())
18787 return true;
18788
18789 // TODO: Remove this.
18790 return RMW->getFunction()
18791 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18792 .getValueAsBool();
18793}
18794
18796 LLVMContext &Ctx = RMW->getContext();
18797 StringRef MemScope =
18798 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18799
18800 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18801 << "Hardware instruction generated for atomic "
18802 << RMW->getOperationName(RMW->getOperation())
18803 << " operation at memory scope " << MemScope;
18804}
18805
18806static bool isV2F16OrV2BF16(Type *Ty) {
18807 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18808 Type *EltTy = VT->getElementType();
18809 return VT->getNumElements() == 2 &&
18810 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18811 }
18812
18813 return false;
18814}
18815
18816static bool isV2F16(Type *Ty) {
18818 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18819}
18820
18821static bool isV2BF16(Type *Ty) {
18823 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18824}
18825
18826/// \return true if atomicrmw integer ops work for the type.
18827static bool isAtomicRMWLegalIntTy(Type *Ty) {
18828 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18829 unsigned BW = IT->getBitWidth();
18830 return BW == 32 || BW == 64;
18831 }
18832
18833 return false;
18834}
18835
18836/// \return true if this atomicrmw xchg type can be selected.
18837static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18838 Type *Ty = RMW->getType();
18839 if (isAtomicRMWLegalIntTy(Ty))
18840 return true;
18841
18842 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18843 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18844 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18845 return BW == 32 || BW == 64;
18846 }
18847
18848 if (Ty->isFloatTy() || Ty->isDoubleTy())
18849 return true;
18850
18852 return VT->getNumElements() == 2 &&
18853 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18854 }
18855
18856 return false;
18857}
18858
18859/// \returns true if it's valid to emit a native instruction for \p RMW, based
18860/// on the properties of the target memory.
18861static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18862 const AtomicRMWInst *RMW,
18863 bool HasSystemScope) {
18864 // The remote/fine-grained access logic is different from the integer
18865 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18866 // fine-grained access does not work, even for a device local allocation.
18867 //
18868 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18869 // allocations work.
18870 if (HasSystemScope) {
18871 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
18872 RMW->hasMetadata("amdgpu.no.remote.memory"))
18873 return true;
18874 if (Subtarget.hasEmulatedSystemScopeAtomics())
18875 return true;
18876 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
18877 return true;
18878
18879 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18880}
18881
18882/// \return Action to perform on AtomicRMWInsts for integer operations.
18889
18890/// Return if a flat address space atomicrmw can access private memory.
18892 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18893 return !MD ||
18895}
18896
18899 // For GAS, lower to flat atomic.
18900 return STI.hasGloballyAddressableScratch()
18903}
18904
18907 unsigned AS = RMW->getPointerAddressSpace();
18908 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18910
18911 // 64-bit flat atomics that dynamically reside in private memory will silently
18912 // be dropped.
18913 //
18914 // Note that we will emit a new copy of the original atomic in the expansion,
18915 // which will be incrementally relegalized.
18916 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18917 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18918 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18921
18922 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18924 ORE.emit([=]() {
18925 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18926 });
18927 return Kind;
18928 };
18929
18930 auto SSID = RMW->getSyncScopeID();
18931 bool HasSystemScope =
18932 SSID == SyncScope::System ||
18933 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18934
18935 auto Op = RMW->getOperation();
18936 switch (Op) {
18938 // PCIe supports add and xchg for system atomics.
18939 return isAtomicRMWLegalXChgTy(RMW)
18942 case AtomicRMWInst::Add:
18943 // PCIe supports add and xchg for system atomics.
18945 case AtomicRMWInst::Sub:
18946 case AtomicRMWInst::And:
18947 case AtomicRMWInst::Or:
18948 case AtomicRMWInst::Xor:
18949 case AtomicRMWInst::Max:
18950 case AtomicRMWInst::Min:
18957 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18959 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18962 auto *IT = dyn_cast<IntegerType>(RMW->getType());
18963 if (!IT || IT->getBitWidth() != 32)
18965 }
18966
18969 if (Subtarget->hasEmulatedSystemScopeAtomics())
18971
18972 // On most subtargets, for atomicrmw operations other than add/xchg,
18973 // whether or not the instructions will behave correctly depends on where
18974 // the address physically resides and what interconnect is used in the
18975 // system configuration. On some targets the instruction will nop,
18976 // and in others synchronization will only occur at degraded device scope.
18977 //
18978 // If the allocation is known local to the device, the instructions should
18979 // work correctly.
18980 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18982
18983 // If fine-grained remote memory works at device scope, we don't need to
18984 // do anything.
18985 if (!HasSystemScope &&
18986 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
18988
18989 // If we are targeting a remote allocated address, it depends what kind of
18990 // allocation the address belongs to.
18991 //
18992 // If the allocation is fine-grained (in host memory, or in PCIe peer
18993 // device memory), the operation will fail depending on the target.
18994 //
18995 // Note that fine-grained host memory access does work on APUs or if XGMI
18996 // is used, but we cannot tell from the ISA version/target-cpu whether we
18997 // are targeting an APU or what the system configuration is.
18998 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
19000
19003 // Atomic sub/or/xor do not work over PCI express, but atomic add
19004 // does. InstCombine transforms these with 0 to or, so undo that.
19005 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
19006 ConstVal && ConstVal->isNullValue())
19008 }
19009
19010 // If the allocation could be in remote, fine-grained memory, the rmw
19011 // instructions may fail. cmpxchg should work, so emit that. On some
19012 // system configurations, PCIe atomics aren't supported so cmpxchg won't
19013 // even work, so you're out of luck anyway.
19014
19015 // In summary:
19016 //
19017 // Cases that may fail:
19018 // - fine-grained pinned host memory
19019 // - fine-grained migratable host memory
19020 // - fine-grained PCIe peer device
19021 //
19022 // Cases that should work, but may be treated overly conservatively.
19023 // - fine-grained host memory on an APU
19024 // - fine-grained XGMI peer device
19026 }
19027
19029 }
19030 case AtomicRMWInst::FAdd: {
19031 Type *Ty = RMW->getType();
19032
19033 // TODO: Handle REGION_ADDRESS
19034 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19035 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
19036 // is fixed to round-to-nearest-even.
19037 //
19038 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
19039 // round-to-nearest-even.
19040 //
19041 // We ignore the rounding mode problem, even in strictfp. The C++ standard
19042 // suggests it is OK if the floating-point mode may not match the calling
19043 // thread.
19044 if (Ty->isFloatTy()) {
19045 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
19047 }
19048
19049 if (Ty->isDoubleTy()) {
19050 // Ignores denormal mode, but we don't consider flushing mandatory.
19051 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
19053 }
19054
19055 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19057
19059 }
19060
19061 // LDS atomics respect the denormal mode from the mode register.
19062 //
19063 // Traditionally f32 global/buffer memory atomics would unconditionally
19064 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
19065 // flush.
19066 //
19067 // On targets with flat atomic fadd, denormals would flush depending on
19068 // whether the target address resides in LDS or global memory. We consider
19069 // this flat-maybe-flush as will-flush.
19070 if (Ty->isFloatTy() &&
19071 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19074
19075 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
19076 // safe. The message phrasing also should be better.
19077 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
19078 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19079 // gfx942, gfx12
19080 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19081 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19082 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
19083 // gfx90a, gfx942, gfx12
19084 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19085 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19086
19087 // gfx942, gfx12
19088 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19089 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19090 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19091 // gfx90a, gfx942, gfx12
19092 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19093 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19094
19095 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
19096 // buffer. gfx12 does have the buffer version.
19097 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19098 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19099 }
19100
19101 // global and flat atomic fadd f64: gfx90a, gfx942.
19102 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19103 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19104
19105 if (AS != AMDGPUAS::FLAT_ADDRESS) {
19106 if (Ty->isFloatTy()) {
19107 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
19108 // gfx11+.
19109 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19110 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19111 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
19112 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19113 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19114 } else {
19115 // gfx908
19116 if (RMW->use_empty() &&
19117 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19118 isV2F16(Ty))
19119 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19120 }
19121 }
19122
19123 // flat atomic fadd f32: gfx942, gfx11+.
19124 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
19125 if (Subtarget->hasFlatAtomicFaddF32Inst())
19126 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19127
19128 // If the address is in the flat address space and the type is float, we
19129 // try to expand it when the target supports both global and LDS atomic
19130 // fadd. In the expansion we emit a check of the address space: if the
19131 // address is in the global address space we emit the global atomic fadd;
19132 // if it is in the shared address space we emit the LDS atomic
19133 // fadd.
19134 if (Subtarget->hasLDSFPAtomicAddF32()) {
19135 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19137 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19139 }
19140 }
19141 }
19142
19144 }
19146 case AtomicRMWInst::FMax: {
19147 Type *Ty = RMW->getType();
19148
19149 // LDS float and double fmin/fmax were always supported.
19150 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19151 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
19153 }
19154
19155 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
19156 // For flat and global cases:
19157 // float, double in gfx7. Manual claims denormal support.
19158 // Removed in gfx8.
19159 // float, double restored in gfx10.
19160 // double removed again in gfx11, so only f32 for gfx11/gfx12.
19161 //
19162 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
19163 // no f32.
19164 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19165 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19166 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19167 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19168 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19169 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
19171 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19172 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19173 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19174 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19175 }
19176 }
19177
19179 }
19182 default:
19184 }
19185
19186 llvm_unreachable("covered atomicrmw op switch");
19187}
19188
19195
19202
19205 const AtomicCmpXchgInst *CmpX) const {
19206 unsigned AddrSpace = CmpX->getPointerAddressSpace();
19207 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19209
19210 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
19212
19213 const DataLayout &DL = CmpX->getDataLayout();
19214
19215 Type *ValTy = CmpX->getNewValOperand()->getType();
19216
19217 // If a 64-bit flat atomic may alias private, we need to avoid using the
19218 // atomic in the private case.
19219 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
19221}
19222
19223const TargetRegisterClass *
19224SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
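// Divergent values must live in vector registers (the AV superclass on
// gfx90a+, otherwise VGPRs), while uniform values use SGPRs; a uniform i1
// uses the wave-sized scalar mask class.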
19226 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19227 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19228 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19229 : &AMDGPU::SReg_32RegClass;
19230 if (!TRI->isSGPRClass(RC) && !isDivergent)
19231 return TRI->getEquivalentSGPRClass(RC);
19232 if (TRI->isSGPRClass(RC) && isDivergent) {
19233 if (Subtarget->hasGFX90AInsts())
19234 return TRI->getEquivalentAVClass(RC);
19235 return TRI->getEquivalentVGPRClass(RC);
19236 }
19237
19238 return RC;
19239}
19240
19241// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19242// uniform values (as produced by the mask results of control flow intrinsics)
19243// used outside of divergent blocks. The phi users need to also be treated as
19244// always uniform.
19245//
19246// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19247static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19248 unsigned WaveSize) {
19249 // FIXME: We assume we never cast the mask results of a control flow
19250 // intrinsic.
19251 // Early exit if the type won't be consistent, as a compile-time hack.
19252 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
19253 if (!IT || IT->getBitWidth() != WaveSize)
19254 return false;
19255
19256 if (!isa<Instruction>(V))
19257 return false;
19258 if (!Visited.insert(V).second)
19259 return false;
19260 bool Result = false;
19261 for (const auto *U : V->users()) {
19263 if (V == U->getOperand(1)) {
19264 switch (Intrinsic->getIntrinsicID()) {
19265 default:
19266 Result = false;
19267 break;
19268 case Intrinsic::amdgcn_if_break:
19269 case Intrinsic::amdgcn_if:
19270 case Intrinsic::amdgcn_else:
19271 Result = true;
19272 break;
19273 }
19274 }
19275 if (V == U->getOperand(0)) {
19276 switch (Intrinsic->getIntrinsicID()) {
19277 default:
19278 Result = false;
19279 break;
19280 case Intrinsic::amdgcn_end_cf:
19281 case Intrinsic::amdgcn_loop:
19282 Result = true;
19283 break;
19284 }
19285 }
19286 } else {
19287 Result = hasCFUser(U, Visited, WaveSize);
19288 }
19289 if (Result)
19290 break;
19291 }
19292 return Result;
19293}
19294
19296 const Value *V) const {
19297 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
19298 if (CI->isInlineAsm()) {
19299 // FIXME: This cannot give a correct answer. This should only trigger in
19300 // the case where inline asm returns mixed SGPR and VGPR results, used
19301 // outside the defining block. We don't have a specific result to
19302 // consider, so this assumes that if any value is an SGPR, the overall
19303 // register also needs to be an SGPR.
19304 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19306 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
19307 for (auto &TC : TargetConstraints) {
19308 if (TC.Type == InlineAsm::isOutput) {
19310 const TargetRegisterClass *RC =
19311 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
19312 TC.ConstraintVT)
19313 .second;
19314 if (RC && SIRI->isSGPRClass(RC))
19315 return true;
19316 }
19317 }
19318 }
19319 }
19321 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19322}
19323
19325 for (SDUse &Use : N->uses()) {
19327 if (getBasePtrIndex(M) == Use.getOperandNo())
19328 return true;
19329 }
19330 }
19331 return false;
19332}
19333
19335 SDValue N1) const {
19336 if (!N0.hasOneUse())
19337 return false;
19338 // Take the opportunity to keep N0 uniform where possible.
19339 if (N0->isDivergent() || !N1->isDivergent())
19340 return true;
19341 // Check if we have a good chance to form the memory access pattern with the
19342 // base and offset
19343 return (DAG.isBaseWithConstantOffset(N0) &&
19345}
19346
19348 Register N0, Register N1) const {
19349 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
19350}
19351
19354 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19356 if (I.getMetadata("amdgpu.noclobber"))
19357 Flags |= MONoClobber;
19358 if (I.getMetadata("amdgpu.last.use"))
19359 Flags |= MOLastUse;
19360 return Flags;
19361}
19362
19364 Instruction *AI) const {
19365 // Given: atomicrmw fadd ptr %addr, float %val ordering
19366 //
19367 // With this expansion we produce the following code:
19368 // [...]
19369 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19370 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19371 //
19372 // atomicrmw.shared:
19373 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19374 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19375 // float %val ordering
19376 // br label %atomicrmw.phi
19377 //
19378 // atomicrmw.check.private:
19379 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19380 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19381 //
19382 // atomicrmw.private:
19383 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19384 // %loaded.private = load float, ptr addrspace(5) %cast.private
19385 // %val.new = fadd float %loaded.private, %val
19386 // store float %val.new, ptr addrspace(5) %cast.private
19387 // br label %atomicrmw.phi
19388 //
19389 // atomicrmw.global:
19390 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19391 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19392 // float %val ordering
19393 // br label %atomicrmw.phi
19394 //
19395 // atomicrmw.phi:
19396 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19397 // [ %loaded.private, %atomicrmw.private ],
19398 // [ %loaded.global, %atomicrmw.global ]
19399 // br label %atomicrmw.end
19400 //
19401 // atomicrmw.end:
19402 // [...]
19403 //
19404 //
19405 // For 64-bit atomics which may reside in private memory, we perform a simpler
19406 // expansion that only inserts the private check and uses the flat operation.
19407
19408 IRBuilder<> Builder(AI);
19409 LLVMContext &Ctx = Builder.getContext();
19410
19411 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
19412 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19414 Value *Addr = AI->getOperand(PtrOpIdx);
19415
19416 /// TODO: Only need to check private, then emit flat-known-not private (no
19417 /// need for shared block, or cast to global).
19419
19420 Align Alignment;
19421 if (RMW)
19422 Alignment = RMW->getAlign();
19423 else if (CX)
19424 Alignment = CX->getAlign();
19425 else
19426 llvm_unreachable("unhandled atomic operation");
19427
19428 // FullFlatEmulation is true if we need to issue the private, shared, and
19429 // global cases.
19430 //
19431 // If this is false, we are only dealing with the flat-targeting-private case,
19432 // where we only insert a check for private and still use the flat instruction
19433 // for global and shared.
19434
19435 bool FullFlatEmulation =
19436 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19437 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19438 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19439 RMW->getType()->isDoubleTy()));
19440
19441 // If the return value isn't used, do not introduce a false use in the phi.
19442 bool ReturnValueIsUsed = !AI->use_empty();
19443
19444 BasicBlock *BB = Builder.GetInsertBlock();
19445 Function *F = BB->getParent();
19446 BasicBlock *ExitBB =
19447 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
19448 BasicBlock *SharedBB = nullptr;
19449
19450 BasicBlock *CheckPrivateBB = BB;
19451 if (FullFlatEmulation) {
19452 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
19453 CheckPrivateBB =
19454 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
19455 }
19456
19457 BasicBlock *PrivateBB =
19458 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
19459 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
19460 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
19461
19462 std::prev(BB->end())->eraseFromParent();
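// Erase the unconditional branch that splitBasicBlock created; the control
// flow to the new blocks is built explicitly below.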
19463 Builder.SetInsertPoint(BB);
19464
19465 Value *LoadedShared = nullptr;
19466 if (FullFlatEmulation) {
19467 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19468 {Addr}, nullptr, "is.shared");
19469 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19470 Builder.SetInsertPoint(SharedBB);
19471 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19472 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19473
19474 Instruction *Clone = AI->clone();
19475 Clone->insertInto(SharedBB, SharedBB->end());
19476 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19477 LoadedShared = Clone;
19478
19479 Builder.CreateBr(PhiBB);
19480 Builder.SetInsertPoint(CheckPrivateBB);
19481 }
19482
19483 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19484 {Addr}, nullptr, "is.private");
19485 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19486
19487 Builder.SetInsertPoint(PrivateBB);
19488
19489 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19490 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19491
19492 Value *LoadedPrivate;
19493 if (RMW) {
19494 LoadedPrivate = Builder.CreateAlignedLoad(
19495 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19496
19497 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19498 LoadedPrivate, RMW->getValOperand());
19499
19500 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19501 } else {
19502 auto [ResultLoad, Equal] =
19503 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19504 CX->getNewValOperand(), CX->getAlign());
19505
19506 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19507 ResultLoad, 0);
19508 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19509 }
19510
19511 Builder.CreateBr(PhiBB);
19512
19513 Builder.SetInsertPoint(GlobalBB);
19514
19515 // Continue using a flat instruction if we only emitted the check for private.
19516 Instruction *LoadedGlobal = AI;
19517 if (FullFlatEmulation) {
19518 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19519 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19520 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19521 }
19522
19523 AI->removeFromParent();
19524 AI->insertInto(GlobalBB, GlobalBB->end());
19525
19526 // The new atomicrmw may go through another round of legalization later.
19527 if (!FullFlatEmulation) {
19528 // We inserted the runtime check already, make sure we do not try to
19529 // re-expand this.
19530 // TODO: Should union with any existing metadata.
19531 MDBuilder MDB(F->getContext());
19532 MDNode *RangeNotPrivate =
19533 MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
19534 APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
19535 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19536 RangeNotPrivate);
19537 }
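// Editor's sketch (hypothetical IR, not from the original source): the
// annotation added above tells later passes that the flat access cannot be
// to the private address space (5), e.g.:
//
//   %old = atomicrmw fadd ptr %addr, double %val seq_cst, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6}   ; excluded half-open range [5, 6)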
19538
19539 Builder.CreateBr(PhiBB);
19540
19541 Builder.SetInsertPoint(PhiBB);
19542
19543 if (ReturnValueIsUsed) {
19544 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19545 AI->replaceAllUsesWith(Loaded);
19546 if (FullFlatEmulation)
19547 Loaded->addIncoming(LoadedShared, SharedBB);
19548 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19549 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19550 Loaded->takeName(AI);
19551 }
19552
19553 Builder.CreateBr(ExitBB);
19554}
19555
19556 static void convertScratchAtomicToFlatAtomic(Instruction *I,
19557 unsigned PtrOpIdx) {
19558 Value *PtrOp = I->getOperand(PtrOpIdx);
19559 assert(PtrOp->getType()->getPointerAddressSpace() ==
19560 AMDGPUAS::PRIVATE_ADDRESS);
19561
19562 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19563 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19564 I->getIterator());
19565 I->setOperand(PtrOpIdx, ASCast);
19566}
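// Editor's example (hypothetical IR, not from the original source): the
// helper above turns a scratch atomic access into a flat one, e.g.
//
//   %v = load atomic i32, ptr addrspace(5) %p seq_cst, align 4
// becomes
//   %scratch.ascast = addrspacecast ptr addrspace(5) %p to ptr
//   %v = load atomic i32, ptr %scratch.ascast seq_cst, align 4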
19567
19568 void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
19569 AtomicRMWInst::BinOp Op = AI->getOperation();
19570
19573
19574 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19575 Op == AtomicRMWInst::Xor) {
19576 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19577 ConstVal && ConstVal->isNullValue()) {
19578 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19579 AI->setOperation(AtomicRMWInst::Add);
19580
19581 // We may still need the private-alias-flat handling below.
19582
19583 // TODO: Skip this for cases where we cannot access remote memory.
19584 }
19585 }
19586
19587 // The non-flat expansions should only perform the de-canonicalization of
19588 // identity values.
19589 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19590 return;
19591
19592 emitExpandAtomicAddrSpacePredicate(AI);
19593}
19594
19601
19602 void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
19603 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19604 return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
19605
19606 llvm_unreachable(
19607 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19608}
19609
19610 void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19611 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19612 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19613
19615 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19616}
19617
19618 LoadInst *
19619 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19620 IRBuilder<> Builder(AI);
19621 auto Order = AI->getOrdering();
19622
19623 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19624 // cache must be flushed if the atomic ordering had release semantics. This is
19625 // not necessarily a fence; a release fence just happens to perform that flush.
19626 // Avoid replacing an atomicrmw that has release semantics.
19627 if (isReleaseOrStronger(Order))
19628 return nullptr;
19629
19630 LoadInst *LI = Builder.CreateAlignedLoad(
19631 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19632 LI->setAtomic(Order, AI->getSyncScopeID());
19633 LI->copyMetadata(*AI);
19634 LI->takeName(AI);
19635 AI->replaceAllUsesWith(LI);
19636 AI->eraseFromParent();
19637 return LI;
19638}
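// Editor's example (hypothetical IR, not from the original source): an
// idempotent RMW such as
//   %old = atomicrmw or ptr %p, i32 0 acquire
// becomes a plain atomic load with the same ordering and scope:
//   %old = load atomic i32, ptr %p acquire, align 4
// Release or stronger orderings are rejected above because dropping the store
// half of the RMW would also drop the cache flush that the release implies.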
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1260
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1257
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1183
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5975
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1560
bool isNegative() const
Definition APFloat.h:1512
bool isNormal() const
Definition APFloat.h:1516
APInt bitcastToAPInt() const
Definition APFloat.h:1416
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1201
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1142
bool isInfinity() const
Definition APFloat.h:1509
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1654
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:338
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:402
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:483
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:215
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:896
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:764
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:805
Argument * getArg(unsigned i) const
Definition Function.h:890
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:133
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:561
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2775
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:226
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:220
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:223
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given that the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
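As a rough usage sketch (not taken from this file), a lowering hook could build an i1 comparison with this helper; DAG, DL, LHS and RHS are assumed to already be in scope.
  // Hypothetical fragment inside a custom lowering routine (names assumed in scope).
  SDValue IsLess = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETLT);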
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
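Illustrative sketch only: packing two assumed i32 values Lo and Hi into a v2i32 vector with this helper.
  // Hypothetical fragment; DAG, DL, Lo and Hi assumed in scope.
  SDValue Packed = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});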
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
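Illustrative sketch only: materializing a 32-bit mask constant of the kind a lowering might AND against a value.
  // Hypothetical fragment; DAG and DL assumed in scope.
  SDValue Mask = DAG.getConstant(0x7fffffffu, DL, MVT::i32);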
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
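Illustrative sketch only: computing a signed maximum of two assumed i32 values A and B in a single node.
  // Hypothetical fragment; DAG, DL, A and B assumed in scope.
  SDValue Max = DAG.getSelectCC(DL, A, B, A, B, ISD::SETGT);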
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
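Illustrative sketch only: guarding a combine on the low bits of an assumed operand Op being known zero.
  // Hypothetical fragment; DAG and Op assumed in scope.
  KnownBits Known = DAG.computeKnownBits(Op);
  bool LowTwoBitsZero = Known.countMinTrailingZeros() >= 2;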
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
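Illustrative sketch only: forming the address of the second i32 element of an object whose assumed base pointer is BasePtr.
  // Hypothetical fragment; DAG, DL and BasePtr assumed in scope.
  SDValue Elt1Addr = DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(4));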
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
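Illustrative sketch only: the insert-returns-pair idiom used for visited-set bookkeeping during a DAG walk; N is an assumed node pointer.
  // Hypothetical fragment; requires llvm/ADT/SmallPtrSet.h.
  SmallPtrSet<const SDNode *, 8> Visited;
  if (Visited.insert(N).second) {
    // First time N is seen; process it here.
  }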
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
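Illustrative sketch only: a StringSwitch-based classification of hypothetical constraint letters (the letters and return values are made up for the example).
  // Hypothetical helper; requires llvm/ADT/StringSwitch.h.
  static int classifyConstraint(StringRef C) {
    return StringSwitch<int>(C)
        .Case("s", 0)   // scalar register
        .Case("v", 1)   // vector register
        .Default(-1);   // unknown constraint
  }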
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to ...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:427
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:975
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:970
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
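Illustrative sketch only: the matcher style shared by the IR, MIR (mi_match) and SelectionDAG (sd_match) pattern-match front ends, shown here with the IR-level match() from llvm/IR/PatternMatch.h.
  // Hypothetical helper recognizing "X + 1" at the IR level.
  using namespace llvm::PatternMatch;
  static bool isAddOne(Value *V, Value *&X) {
    return match(V, m_Add(m_Value(X), m_One()));
  }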
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
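A hypothetical sketch of the builder interface above, using one of the BuildMI overloads; MBB, I, DL, TII and DstReg are assumed to already exist and are not taken from this file.

  // Insert "S_MOV_B32 DstReg, 0" before iterator I in MBB.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);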
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
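A few of the integer helpers above, shown as a standalone sketch with their expected results (not code from this file):

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  bool Fits  = llvm::isInt<16>(-32768);     // true: fits in a signed 16-bit field
  int64_t Lo = llvm::minIntN(16);           // -32768
  int Bits   = llvm::bit_width(255u);       // 8
  uint64_t A = llvm::alignDown(37u, 16u);   // 32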
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:301
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0 bits from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition MathExtras.h:357
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0 bits from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
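The bit-manipulation helpers above, again as a standalone sketch with expected results:

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  int TZ      = llvm::countr_zero(0x80u);            // 7
  int LZ      = llvm::countl_zero(uint32_t(1));      // 31
  int Pop     = llvm::popcount(0xF0u);               // 4
  bool Run    = llvm::isShiftedMask_64(0x0ff0);      // true: one contiguous run of ones
  uint64_t P2 = llvm::PowerOf2Ceil(17);              // 32
  unsigned L2 = llvm::Log2_32(32);                   // 5
  uint32_t Hi = llvm::Hi_32(0x123456789abcdef0ULL);  // 0x12345678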
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
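A minimal sketch combining peekThroughBitcasts with the cast machinery above; Op is assumed to be an existing SDValue.

  SDValue Src = peekThroughBitcasts(Op);
  if (auto *C = dyn_cast<ConstantSDNode>(Src)) {
    // Non-null only when the bitcast source is a constant integer node.
    uint64_t Imm = C->getZExtValue();
    (void)Imm;
  }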
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ AfterLegalizeTypes
Definition DAGCombine.h:17
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
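A brief sketch of how the constant-splat queries above are typically combined (Op is an assumed SDValue):

  ConstantSDNode *C = isConstOrConstSplat(Op);
  bool SplatZero  = C && C->isZero();   // scalar zero or all-zero splat
  bool ScalarZero = isNullConstant(Op); // scalar-only form of the same test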
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:434
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
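The range helpers above, in a standalone sketch over a small vector (none of these values come from this file):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  llvm::SmallVector<unsigned, 4> Regs = {1, 2, 3, 4};
  bool AllNonZero = llvm::all_of(Regs, [](unsigned R) { return R != 0; });  // true
  bool HasThree   = llvm::is_contained(Regs, 3u);                           // true
  auto It         = llvm::find_if(Regs, [](unsigned R) { return R > 2; });  // points at 3
  llvm::SmallVector<unsigned, 4> Tail;
  llvm::append_range(Tail, llvm::drop_begin(Regs));                         // Tail == {2, 3, 4}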
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
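The alignment and mask helpers above, as a standalone sketch with expected results:

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  uint64_t Padded  = llvm::alignTo(100, llvm::Align(16));        // 112
  llvm::Align Comb = llvm::commonAlignment(llvm::Align(16), 4);  // Align(4)
  uint64_t Chunks  = llvm::divideCeil(10, 3);                    // 4
  uint32_t LowMask = llvm::maskTrailingOnes<uint32_t>(5);        // 0x1f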
constexpr RegState getUndefRegState(bool B)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
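A standalone sketch of building and querying an EVT with the accessors above; Ctx is an assumed LLVMContext.

  #include "llvm/CodeGen/ValueTypes.h"
  EVT V4F16       = EVT::getVectorVT(Ctx, MVT::f16, 4);
  bool IsVec      = V4F16.isVector();              // true
  unsigned NumElt = V4F16.getVectorNumElements();  // 4
  EVT Scalar      = V4F16.getScalarType();         // f16
  EVT AsInt       = V4F16.changeTypeToInteger();   // v4i16
  TypeSize Bits   = V4F16.getSizeInBits();         // 64 bits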
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:175
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:238
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:183
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:360
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:261
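A small sketch combining the KnownBits operations above; the bounds in the comments follow from the constructed inputs.

  #include "llvm/Support/KnownBits.h"
  KnownBits LHS(8), RHS(8);
  LHS.Zero.setHighBits(4);                     // LHS is known to be < 16
  RHS.Zero.setHighBits(4);                     // RHS is known to be < 16
  KnownBits Sum  = KnownBits::add(LHS, RHS);   // the sum fits in 5 bits
  unsigned MinLZ = Sum.countMinLeadingZeros(); // at least 3 for these inputs
  KnownBits Wide = Sum.zext(16);               // zero extension adds 8 known-zero bits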
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
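A hypothetical sketch of the MachinePointerInfo factories above; MF and FI are assumed to already exist.

  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachinePointerInfo Hi4     = PtrInfo.getWithOffset(4);  // same slot, 4 bytes in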
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs