AMDGPUISelDAGToDAG.cpp
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
33
34#ifdef EXPENSIVE_CHECKS
36#include "llvm/IR/Dominators.h"
37#endif
38
39#define DEBUG_TYPE "amdgpu-isel"
40
41using namespace llvm;
42
43//===----------------------------------------------------------------------===//
44// Instruction Selector Implementation
45//===----------------------------------------------------------------------===//
46
47namespace {
48static SDValue stripBitcast(SDValue Val) {
49 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
50}
51
52// Figure out if this is really an extract of the high 16-bits of a dword.
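// For example, both of these forms are recognized here, with Out set to v:
//   (extract_vector_elt (v2i16 v), 1)
//   (trunc (srl (i32 (bitcast (v2i16 v))), 16))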
53static bool isExtractHiElt(SDValue In, SDValue &Out) {
54 In = stripBitcast(In);
55
56 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
57 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
58 if (!Idx->isOne())
59 return false;
60 Out = In.getOperand(0);
61 return true;
62 }
63 }
64
65 if (In.getOpcode() != ISD::TRUNCATE)
66 return false;
67
68 SDValue Srl = In.getOperand(0);
69 if (Srl.getOpcode() == ISD::SRL) {
70 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
71 if (ShiftAmt->getZExtValue() == 16) {
72 Out = stripBitcast(Srl.getOperand(0));
73 return true;
74 }
75 }
76 }
77
78 return false;
79}
80
81// Look through operations that obscure just looking at the low 16-bits of the
82// same register.
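// For example, (extract_vector_elt (v2i16 v), 0) and
// (trunc (i32 (bitcast (v2i16 v)))) both reduce to v here.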
83static SDValue stripExtractLoElt(SDValue In) {
84 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
85 SDValue Idx = In.getOperand(1);
86 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
87 return In.getOperand(0);
88 }
89
90 if (In.getOpcode() == ISD::TRUNCATE) {
91 SDValue Src = In.getOperand(0);
92 if (Src.getValueType().getSizeInBits() == 32)
93 return stripBitcast(Src);
94 }
95
96 return In;
97}
98
99} // end anonymous namespace
100
102 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
106#ifdef EXPENSIVE_CHECKS
109#endif
111 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
112
113/// This pass converts a legalized DAG into an AMDGPU-specific
114/// DAG, ready for instruction scheduling.
115FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
116 CodeGenOptLevel OptLevel) {
117 return new AMDGPUDAGToDAGISel(TM, OptLevel);
118}
119
120AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
121 CodeGenOptLevel OptLevel)
122 : SelectionDAGISel(ID, TM, OptLevel) {
123 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
124}
125
126bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
127#ifdef EXPENSIVE_CHECKS
128 DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
129 LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
130 for (auto &L : LI->getLoopsInPreorder()) {
131 assert(L->isLCSSAForm(DT));
132 }
133#endif
134 Subtarget = &MF.getSubtarget<GCNSubtarget>();
136 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
137 return SelectionDAGISel::runOnMachineFunction(MF);
138}
139
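// Return true if an f16 result produced by a node with opcode Opc is known to
// have zeros in the unused high 16 bits of its 32-bit register. This holds for
// most 16-bit ALU ops up to GFX9; GFX10+ 16-bit instructions preserve the high
// bits instead, and ops lowered to 32-bit bit operations do not zero them.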
140bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
141 // XXX - only need to list legal operations.
142 switch (Opc) {
143 case ISD::FADD:
144 case ISD::FSUB:
145 case ISD::FMUL:
146 case ISD::FDIV:
147 case ISD::FREM:
149 case ISD::UINT_TO_FP:
150 case ISD::SINT_TO_FP:
151 case ISD::FABS:
152 // Fabs is lowered to a bit operation, but it's an and which will clear the
153 // high bits anyway.
154 case ISD::FSQRT:
155 case ISD::FSIN:
156 case ISD::FCOS:
157 case ISD::FPOWI:
158 case ISD::FPOW:
159 case ISD::FLOG:
160 case ISD::FLOG2:
161 case ISD::FLOG10:
162 case ISD::FEXP:
163 case ISD::FEXP2:
164 case ISD::FCEIL:
165 case ISD::FTRUNC:
166 case ISD::FRINT:
167 case ISD::FNEARBYINT:
168 case ISD::FROUNDEVEN:
169 case ISD::FROUND:
170 case ISD::FFLOOR:
171 case ISD::FMINNUM:
172 case ISD::FMAXNUM:
173 case ISD::FLDEXP:
174 case AMDGPUISD::FRACT:
175 case AMDGPUISD::CLAMP:
178 case AMDGPUISD::FMIN3:
179 case AMDGPUISD::FMAX3:
180 case AMDGPUISD::FMED3:
182 case AMDGPUISD::RCP:
183 case AMDGPUISD::RSQ:
185 // On gfx10, all 16-bit instructions preserve the high bits.
186 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
187 case ISD::FP_ROUND:
188 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
189 // high bits on gfx9.
190 // TODO: If we had the source node we could see if the source was fma/mad
191 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
192 case ISD::FMA:
193 case ISD::FMAD:
195 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
196 default:
197 // fcopysign, select and others may be lowered to 32-bit bit operations
198 // which don't zero the high bits.
199 return false;
200 }
201}
202
203void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
206#ifdef EXPENSIVE_CHECKS
209#endif
210 SelectionDAGISel::getAnalysisUsage(AU);
211}
212
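// Try to fold a 16-bit (or extending 8-bit) load feeding one half of a
// v2i16/v2f16 build_vector into a d16 load, which writes only one half of the
// destination register and takes the other half as a tied input.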
213bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
214 assert(Subtarget->d16PreservesUnusedBits());
215 MVT VT = N->getValueType(0).getSimpleVT();
216 if (VT != MVT::v2i16 && VT != MVT::v2f16)
217 return false;
218
219 SDValue Lo = N->getOperand(0);
220 SDValue Hi = N->getOperand(1);
221
222 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
223
224 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
225 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
226 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
227
228 // Need to check for possible indirect dependencies on the other half of the
229 // vector to avoid introducing a cycle.
230 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
231 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
232
233 SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
234 SDValue Ops[] = {
235 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
236 };
237
238 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
239 if (LdHi->getMemoryVT() == MVT::i8) {
240 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
241 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
242 } else {
243 assert(LdHi->getMemoryVT() == MVT::i16);
244 }
245
246 SDValue NewLoadHi =
247 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
248 Ops, LdHi->getMemoryVT(),
249 LdHi->getMemOperand());
250
251 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
252 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
253 return true;
254 }
255
256 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
257 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
258 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
259 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
260 if (LdLo && Lo.hasOneUse()) {
261 SDValue TiedIn = getHi16Elt(Hi);
262 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
263 return false;
264
265 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
266 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
267 if (LdLo->getMemoryVT() == MVT::i8) {
268 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
269 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
270 } else {
271 assert(LdLo->getMemoryVT() == MVT::i16);
272 }
273
274 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
275
276 SDValue Ops[] = {
277 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
278 };
279
280 SDValue NewLoadLo =
281 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
282 Ops, LdLo->getMemoryVT(),
283 LdLo->getMemOperand());
284
285 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
286 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
287 return true;
288 }
289
290 return false;
291}
292
293void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
294 if (!Subtarget->d16PreservesUnusedBits())
295 return;
296
297 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
298
299 bool MadeChange = false;
300 while (Position != CurDAG->allnodes_begin()) {
301 SDNode *N = &*--Position;
302 if (N->use_empty())
303 continue;
304
305 switch (N->getOpcode()) {
306 case ISD::BUILD_VECTOR:
307 // TODO: Match load d16 from shl (extload:i16), 16
308 MadeChange |= matchLoadD16FromBuildVector(N);
309 break;
310 default:
311 break;
312 }
313 }
314
315 if (MadeChange) {
316 CurDAG->RemoveDeadNodes();
317 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
318 CurDAG->dump(););
319 }
320}
321
322bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
323 if (N->isUndef())
324 return true;
325
326 const SIInstrInfo *TII = Subtarget->getInstrInfo();
327 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
328 return TII->isInlineConstant(C->getAPIntValue());
329
330 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
331 return TII->isInlineConstant(C->getValueAPF());
332
333 return false;
334}
335
336/// Determine the register class for \p OpNo
337/// \returns The register class of the virtual register that will be used for
338/// the given operand number \p OpNo or NULL if the register class cannot be
339/// determined.
340const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
341 unsigned OpNo) const {
342 if (!N->isMachineOpcode()) {
343 if (N->getOpcode() == ISD::CopyToReg) {
344 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
345 if (Reg.isVirtual()) {
346 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
347 return MRI.getRegClass(Reg);
348 }
349
350 const SIRegisterInfo *TRI
351 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
352 return TRI->getPhysRegBaseClass(Reg);
353 }
354
355 return nullptr;
356 }
357
358 switch (N->getMachineOpcode()) {
359 default: {
360 const MCInstrDesc &Desc =
361 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
362 unsigned OpIdx = Desc.getNumDefs() + OpNo;
363 if (OpIdx >= Desc.getNumOperands())
364 return nullptr;
365 int RegClass = Desc.operands()[OpIdx].RegClass;
366 if (RegClass == -1)
367 return nullptr;
368
369 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
370 }
371 case AMDGPU::REG_SEQUENCE: {
372 unsigned RCID = N->getConstantOperandVal(0);
373 const TargetRegisterClass *SuperRC =
374 Subtarget->getRegisterInfo()->getRegClass(RCID);
375
376 SDValue SubRegOp = N->getOperand(OpNo + 1);
377 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
378 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
379 SubRegIdx);
380 }
381 }
382}
383
384SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
385 SDValue Glue) const {
386 SmallVector <SDValue, 8> Ops;
387 Ops.push_back(NewChain); // Replace the chain.
388 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
389 Ops.push_back(N->getOperand(i));
390
391 Ops.push_back(Glue);
392 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
393}
394
395SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
396 const SITargetLowering& Lowering =
397 *static_cast<const SITargetLowering*>(getTargetLowering());
398
399 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
400
401 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
402 return glueCopyToOp(N, M0, M0.getValue(1));
403}
404
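// For LDS (local address space) accesses on subtargets that still require M0
// to be initialized, glue a copy of -1 into M0; for GDS (region address space)
// accesses, glue a copy of the GDS size instead. Other address spaces are
// returned unchanged.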
405SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
406 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
407 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
408 if (Subtarget->ldsRequiresM0Init())
409 return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
410 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
411 MachineFunction &MF = CurDAG->getMachineFunction();
412 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
413 return
414 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
415 }
416 return N;
417}
418
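// Materialize a 64-bit scalar immediate as two S_MOV_B32s of the low and high
// halves, combined into a single 64-bit SGPR value with a REG_SEQUENCE.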
419MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
420 EVT VT) const {
421 SDNode *Lo = CurDAG->getMachineNode(
422 AMDGPU::S_MOV_B32, DL, MVT::i32,
423 CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
424 SDNode *Hi =
425 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
426 CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
427 const SDValue Ops[] = {
428 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
429 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
430 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
431
432 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
433}
434
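// Select a build_vector (or scalar_to_vector) into a REG_SEQUENCE over the
// given register class: one (value, subregister index) pair per element, with
// any trailing elements of a scalar_to_vector filled in by IMPLICIT_DEF.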
435void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
436 EVT VT = N->getValueType(0);
437 unsigned NumVectorElts = VT.getVectorNumElements();
438 EVT EltVT = VT.getVectorElementType();
439 SDLoc DL(N);
440 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
441
442 if (NumVectorElts == 1) {
443 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
444 RegClass);
445 return;
446 }
447
448 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
449 "supported yet");
450 // 32 = Max Num Vector Elements
451 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
452 // 1 = Vector Register Class
453 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
454
455 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
456 Triple::amdgcn;
457 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
458 bool IsRegSeq = true;
459 unsigned NOps = N->getNumOperands();
460 for (unsigned i = 0; i < NOps; i++) {
461 // XXX: Why is this here?
462 if (isa<RegisterSDNode>(N->getOperand(i))) {
463 IsRegSeq = false;
464 break;
465 }
466 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
467 : R600RegisterInfo::getSubRegFromChannel(i);
468 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
469 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
470 }
471 if (NOps != NumVectorElts) {
472 // Fill in the missing undef elements if this was a scalar_to_vector.
473 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
474 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
475 DL, EltVT);
476 for (unsigned i = NOps; i < NumVectorElts; ++i) {
477 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
478 : R600RegisterInfo::getSubRegFromChannel(i);
479 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
480 RegSeqArgs[1 + (2 * i) + 1] =
481 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
482 }
483 }
484
485 if (!IsRegSeq)
486 SelectCode(N);
487 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
488}
489
490void AMDGPUDAGToDAGISel::Select(SDNode *N) {
491 unsigned int Opc = N->getOpcode();
492 if (N->isMachineOpcode()) {
493 N->setNodeId(-1);
494 return; // Already selected.
495 }
496
497 // isa<MemSDNode> almost works but is slightly too permissive for some DS
498 // intrinsics.
499 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
502 N = glueCopyToM0LDSInit(N);
503 SelectCode(N);
504 return;
505 }
506
507 switch (Opc) {
508 default:
509 break;
510 // We are selecting i64 ADD here instead of custom lower it during
511 // DAG legalization, so we can fold some i64 ADDs used for address
512 // calculation into the LOAD and STORE instructions.
513 case ISD::ADDC:
514 case ISD::ADDE:
515 case ISD::SUBC:
516 case ISD::SUBE: {
517 if (N->getValueType(0) != MVT::i64)
518 break;
519
520 SelectADD_SUB_I64(N);
521 return;
522 }
523 case ISD::UADDO_CARRY:
524 case ISD::USUBO_CARRY:
525 if (N->getValueType(0) != MVT::i32)
526 break;
527
528 SelectAddcSubb(N);
529 return;
530 case ISD::UADDO:
531 case ISD::USUBO: {
532 SelectUADDO_USUBO(N);
533 return;
534 }
535 case AMDGPUISD::FMUL_W_CHAIN: {
536 SelectFMUL_W_CHAIN(N);
537 return;
538 }
539 case AMDGPUISD::FMA_W_CHAIN: {
540 SelectFMA_W_CHAIN(N);
541 return;
542 }
543
544 case ISD::SCALAR_TO_VECTOR:
545 case ISD::BUILD_VECTOR: {
546 EVT VT = N->getValueType(0);
547 unsigned NumVectorElts = VT.getVectorNumElements();
548 if (VT.getScalarSizeInBits() == 16) {
549 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
550 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
551 ReplaceNode(N, Packed);
552 return;
553 }
554 }
555
556 break;
557 }
558
559 assert(VT.getVectorElementType().bitsEq(MVT::i32));
560 unsigned RegClassID =
561 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
562 SelectBuildVector(N, RegClassID);
563 return;
564 }
565 case ISD::BUILD_PAIR: {
566 SDValue RC, SubReg0, SubReg1;
567 SDLoc DL(N);
568 if (N->getValueType(0) == MVT::i128) {
569 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
570 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
571 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
572 } else if (N->getValueType(0) == MVT::i64) {
573 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
574 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
575 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
576 } else {
577 llvm_unreachable("Unhandled value type for BUILD_PAIR");
578 }
579 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
580 N->getOperand(1), SubReg1 };
581 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
582 N->getValueType(0), Ops));
583 return;
584 }
585
586 case ISD::Constant:
587 case ISD::ConstantFP: {
588 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
589 break;
590
591 uint64_t Imm;
592 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
593 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
594 if (AMDGPU::isValid32BitLiteral(Imm, true))
595 break;
596 } else {
597 ConstantSDNode *C = cast<ConstantSDNode>(N);
598 Imm = C->getZExtValue();
599 if (AMDGPU::isValid32BitLiteral(Imm, false))
600 break;
601 }
602
603 SDLoc DL(N);
604 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
605 return;
606 }
607 case AMDGPUISD::BFE_I32:
608 case AMDGPUISD::BFE_U32: {
609 // There is a scalar version available, but unlike the vector version which
610 // has a separate operand for the offset and width, the scalar version packs
611 // the width and offset into a single operand. Try to move to the scalar
612 // version if the offsets are constant, so that we can try to keep extended
613 // loads of kernel arguments in SGPRs.
614
615 // TODO: Technically we could try to pattern match scalar bitshifts of
616 // dynamic values, but it's probably not useful.
617 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
618 if (!Offset)
619 break;
620
621 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
622 if (!Width)
623 break;
624
625 bool Signed = Opc == AMDGPUISD::BFE_I32;
626
627 uint32_t OffsetVal = Offset->getZExtValue();
628 uint32_t WidthVal = Width->getZExtValue();
629
630 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
631 WidthVal));
632 return;
633 }
634 case AMDGPUISD::DIV_SCALE: {
635 SelectDIV_SCALE(N);
636 return;
637 }
638 case AMDGPUISD::MAD_I64_I32:
639 case AMDGPUISD::MAD_U64_U32: {
640 SelectMAD_64_32(N);
641 return;
642 }
643 case ISD::SMUL_LOHI:
644 case ISD::UMUL_LOHI:
645 return SelectMUL_LOHI(N);
646 case ISD::CopyToReg: {
647 const SITargetLowering& Lowering =
648 *static_cast<const SITargetLowering*>(getTargetLowering());
649 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
650 break;
651 }
652 case ISD::AND:
653 case ISD::SRL:
654 case ISD::SRA:
655 case ISD::SIGN_EXTEND_INREG:
656 if (N->getValueType(0) != MVT::i32)
657 break;
658
659 SelectS_BFE(N);
660 return;
661 case ISD::BRCOND:
662 SelectBRCOND(N);
663 return;
664 case ISD::FP_EXTEND:
665 SelectFP_EXTEND(N);
666 return;
667 case AMDGPUISD::CVT_PKRTZ_F16_F32:
668 case AMDGPUISD::CVT_PKNORM_I16_F32:
669 case AMDGPUISD::CVT_PKNORM_U16_F32:
670 case AMDGPUISD::CVT_PK_U16_U32:
671 case AMDGPUISD::CVT_PK_I16_I32: {
672 // Hack around using a legal type if f16 is illegal.
673 if (N->getValueType(0) == MVT::i32) {
674 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
675 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
676 { N->getOperand(0), N->getOperand(1) });
677 SelectCode(N);
678 return;
679 }
680
681 break;
682 }
683 case ISD::INTRINSIC_W_CHAIN: {
684 SelectINTRINSIC_W_CHAIN(N);
685 return;
686 }
687 case ISD::INTRINSIC_WO_CHAIN: {
688 SelectINTRINSIC_WO_CHAIN(N);
689 return;
690 }
691 case ISD::INTRINSIC_VOID: {
692 SelectINTRINSIC_VOID(N);
693 return;
694 }
695 case AMDGPUISD::WAVE_ADDRESS: {
696 SelectWAVE_ADDRESS(N);
697 return;
698 }
699 case ISD::STACKRESTORE: {
700 SelectSTACKRESTORE(N);
701 return;
702 }
703 }
704
705 SelectCode(N);
706}
707
708bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
709 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
710 const Instruction *Term = BB->getTerminator();
711 return Term->getMetadata("amdgpu.uniform") ||
712 Term->getMetadata("structurizecfg.uniform");
713}
714
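// Example: with ShAmtBits == 5 (a 32-bit shift, which only uses the low 5 bits
// of the amount), a mask such as (and x, 31) -- or any mask whose low 5 bits
// are all ones once the known-zero bits of x are taken into account -- does
// not change the shift amount and can be dropped.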
715bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
716 unsigned ShAmtBits) const {
717 assert(N->getOpcode() == ISD::AND);
718
719 const APInt &RHS = N->getConstantOperandAPInt(1);
720 if (RHS.countr_one() >= ShAmtBits)
721 return true;
722
723 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
724 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
725}
726
727static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
728 SDValue &N0, SDValue &N1) {
729 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
730 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
731 // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
732 // (i64 (bitcast (v2i32 (build_vector
733 // (or (extract_vector_elt V, 0), OFFSET),
734 // (extract_vector_elt V, 1)))))
735 SDValue Lo = Addr.getOperand(0).getOperand(0);
736 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
737 SDValue BaseLo = Lo.getOperand(0);
738 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
739 // Check that split base (Lo and Hi) are extracted from the same one.
740 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
741 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
742 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
743 // Lo is statically extracted from index 0.
744 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
745 BaseLo.getConstantOperandVal(1) == 0 &&
746 // Hi is statically extracted from index 1.
747 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
748 BaseHi.getConstantOperandVal(1) == 1) {
749 N0 = BaseLo.getOperand(0).getOperand(0);
750 N1 = Lo.getOperand(1);
751 return true;
752 }
753 }
754 }
755 return false;
756}
757
758bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
759 SDValue &RHS) const {
760 if (CurDAG->isBaseWithConstantOffset(Addr)) {
761 LHS = Addr.getOperand(0);
762 RHS = Addr.getOperand(1);
763 return true;
764 }
765
766 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
767 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
768 return true;
769 }
770
771 return false;
772}
773
775 return "AMDGPU DAG->DAG Pattern Instruction Selection";
776}
777
778//===----------------------------------------------------------------------===//
779// Complex Patterns
780//===----------------------------------------------------------------------===//
781
782bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
783 SDValue &Offset) {
784 return false;
785}
786
787bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
788 SDValue &Offset) {
789 ConstantSDNode *C;
790 SDLoc DL(Addr);
791
792 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
793 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
794 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
795 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
796 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
797 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
798 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
799 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
800 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
801 Base = Addr.getOperand(0);
802 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
803 } else {
804 Base = Addr;
805 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
806 }
807
808 return true;
809}
810
811SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
812 const SDLoc &DL) const {
813 SDNode *Mov = CurDAG->getMachineNode(
814 AMDGPU::S_MOV_B32, DL, MVT::i32,
815 CurDAG->getTargetConstant(Val, DL, MVT::i32));
816 return SDValue(Mov, 0);
817}
818
819// FIXME: Should only handle uaddo_carry/usubo_carry
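// The 64-bit add/sub is split into two 32-bit halves chained through the carry
// bit, roughly:
//   lo     = S_ADD_U32  / V_ADD_CO_U32_e32  lhs.lo, rhs.lo
//   hi     = S_ADDC_U32 / V_ADDC_U32_e32    lhs.hi, rhs.hi, carry(lo)
//   result = REG_SEQUENCE lo, sub0, hi, sub1
// with the scalar or VALU opcodes chosen by the node's divergence.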
820void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
821 SDLoc DL(N);
822 SDValue LHS = N->getOperand(0);
823 SDValue RHS = N->getOperand(1);
824
825 unsigned Opcode = N->getOpcode();
826 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
827 bool ProduceCarry =
828 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
829 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
830
831 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
832 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
833
834 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
835 DL, MVT::i32, LHS, Sub0);
836 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
837 DL, MVT::i32, LHS, Sub1);
838
839 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
840 DL, MVT::i32, RHS, Sub0);
841 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
842 DL, MVT::i32, RHS, Sub1);
843
844 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
845
846 static const unsigned OpcMap[2][2][2] = {
847 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
848 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
849 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
850 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
851
852 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
853 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
854
855 SDNode *AddLo;
856 if (!ConsumeCarry) {
857 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
858 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
859 } else {
860 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
861 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
862 }
863 SDValue AddHiArgs[] = {
864 SDValue(Hi0, 0),
865 SDValue(Hi1, 0),
866 SDValue(AddLo, 1)
867 };
868 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
869
870 SDValue RegSequenceArgs[] = {
871 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
872 SDValue(AddLo,0),
873 Sub0,
874 SDValue(AddHi,0),
875 Sub1,
876 };
877 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
878 MVT::i64, RegSequenceArgs);
879
880 if (ProduceCarry) {
881 // Replace the carry-use
882 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
883 }
884
885 // Replace the remaining uses.
886 ReplaceNode(N, RegSequence);
887}
888
889void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
890 SDLoc DL(N);
891 SDValue LHS = N->getOperand(0);
892 SDValue RHS = N->getOperand(1);
893 SDValue CI = N->getOperand(2);
894
895 if (N->isDivergent()) {
896 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
897 : AMDGPU::V_SUBB_U32_e64;
898 CurDAG->SelectNodeTo(
899 N, Opc, N->getVTList(),
900 {LHS, RHS, CI,
901 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
902 } else {
903 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
904 : AMDGPU::S_SUB_CO_PSEUDO;
905 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
906 }
907}
908
909void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
910 // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
911 // carry out despite the _i32 name. These were renamed in VI to _U32.
912 // FIXME: We should probably rename the opcodes here.
913 bool IsAdd = N->getOpcode() == ISD::UADDO;
914 bool IsVALU = N->isDivergent();
915
916 for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
917 ++UI)
918 if (UI.getUse().getResNo() == 1) {
919 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
920 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
921 IsVALU = true;
922 break;
923 }
924 }
925
926 if (IsVALU) {
927 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
928
929 CurDAG->SelectNodeTo(
930 N, Opc, N->getVTList(),
931 {N->getOperand(0), N->getOperand(1),
932 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
933 } else {
934 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
935 : AMDGPU::S_USUBO_PSEUDO;
936
937 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
938 {N->getOperand(0), N->getOperand(1)});
939 }
940}
941
942void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
943 SDLoc SL(N);
944 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
945 SDValue Ops[10];
946
947 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
948 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
949 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
950 Ops[8] = N->getOperand(0);
951 Ops[9] = N->getOperand(4);
952
953 // If there are no source modifiers, prefer fmac over fma because it can use
954 // the smaller VOP2 encoding.
955 bool UseFMAC = Subtarget->hasDLInsts() &&
956 cast<ConstantSDNode>(Ops[0])->isZero() &&
957 cast<ConstantSDNode>(Ops[2])->isZero() &&
958 cast<ConstantSDNode>(Ops[4])->isZero();
959 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
960 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
961}
962
963void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
964 SDLoc SL(N);
965 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
966 SDValue Ops[8];
967
968 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
969 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
970 Ops[6] = N->getOperand(0);
971 Ops[7] = N->getOperand(3);
972
973 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
974}
975
976// We need to handle this here because tablegen doesn't support matching
977// instructions with multiple outputs.
978void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
979 SDLoc SL(N);
980 EVT VT = N->getValueType(0);
981
982 assert(VT == MVT::f32 || VT == MVT::f64);
983
984 unsigned Opc
985 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
986
987 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
988 // omod
989 SDValue Ops[8];
990 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
991 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
992 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
993 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
994}
995
996// We need to handle this here because tablegen doesn't support matching
997// instructions with multiple outputs.
998void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
999 SDLoc SL(N);
1000 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1001 unsigned Opc;
1002 if (Subtarget->hasMADIntraFwdBug())
1003 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1004 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1005 else
1006 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1007
1008 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1009 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1010 Clamp };
1011 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1012}
1013
1014// We need to handle this here because tablegen doesn't support matching
1015// instructions with multiple outputs.
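// smul_lohi / umul_lohi are selected as a single V_MAD_I64_I32 / V_MAD_U64_U32
// with a zero addend; the low and high i32 results are then split back out of
// the 64-bit product with EXTRACT_SUBREG.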
1016void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1017 SDLoc SL(N);
1018 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1019 unsigned Opc;
1020 if (Subtarget->hasMADIntraFwdBug())
1021 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1022 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1023 else
1024 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1025
1026 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1027 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1028 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1029 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1030 if (!SDValue(N, 0).use_empty()) {
1031 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1032 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1033 MVT::i32, SDValue(Mad, 0), Sub0);
1034 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1035 }
1036 if (!SDValue(N, 1).use_empty()) {
1037 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1038 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1039 MVT::i32, SDValue(Mad, 0), Sub1);
1040 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1041 }
1042 CurDAG->RemoveDeadNode(N);
1043}
1044
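// A DS instruction's immediate offset must fit in an unsigned 16-bit field,
// and unless the subtarget has a usable DS offset (or unsafe offset folding is
// enabled) it may only be folded when the base is known to be non-negative.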
1045bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1046 if (!isUInt<16>(Offset))
1047 return false;
1048
1049 if (!Base || Subtarget->hasUsableDSOffset() ||
1050 Subtarget->unsafeDSOffsetFoldingEnabled())
1051 return true;
1052
1053 // On Southern Islands, instructions with a negative base value and an
1054 // offset don't seem to work.
1055 return CurDAG->SignBitIsZero(Base);
1056}
1057
1058bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1059 SDValue &Offset) const {
1060 SDLoc DL(Addr);
1061 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1062 SDValue N0 = Addr.getOperand(0);
1063 SDValue N1 = Addr.getOperand(1);
1064 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1065 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1066 // (add n0, c0)
1067 Base = N0;
1068 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1069 return true;
1070 }
1071 } else if (Addr.getOpcode() == ISD::SUB) {
1072 // sub C, x -> add (sub 0, x), C
1073 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1074 int64_t ByteOffset = C->getSExtValue();
1075 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1076 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1077
1078 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1079 // the known bits in isDSOffsetLegal. We need to emit the selected node
1080 // here, so this is thrown away.
1081 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1082 Zero, Addr.getOperand(1));
1083
1084 if (isDSOffsetLegal(Sub, ByteOffset)) {
1085 SmallVector<SDValue, 3> Opnds;
1086 Opnds.push_back(Zero);
1087 Opnds.push_back(Addr.getOperand(1));
1088
1089 // FIXME: Select to VOP3 version for with-carry.
1090 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1091 if (Subtarget->hasAddNoCarry()) {
1092 SubOp = AMDGPU::V_SUB_U32_e64;
1093 Opnds.push_back(
1094 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1095 }
1096
1097 MachineSDNode *MachineSub =
1098 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1099
1100 Base = SDValue(MachineSub, 0);
1101 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1102 return true;
1103 }
1104 }
1105 }
1106 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1107 // If we have a constant address, prefer to put the constant into the
1108 // offset. This can save moves to load the constant address since multiple
1109 // operations can share the zero base address register, and enables merging
1110 // into read2 / write2 instructions.
1111
1112 SDLoc DL(Addr);
1113
1114 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1115 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1116 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1117 DL, MVT::i32, Zero);
1118 Base = SDValue(MovZero, 0);
1119 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1120 return true;
1121 }
1122 }
1123
1124 // default case
1125 Base = Addr;
1126 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1127 return true;
1128}
1129
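// For the two-offset (ds_read2/ds_write2 style) forms, each offset is encoded
// as an 8-bit count of elements of the given size, so both byte offsets must
// be multiples of Size and must fit in 8 bits after scaling.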
1130bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1131 unsigned Offset1,
1132 unsigned Size) const {
1133 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1134 return false;
1135 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1136 return false;
1137
1138 if (!Base || Subtarget->hasUsableDSOffset() ||
1139 Subtarget->unsafeDSOffsetFoldingEnabled())
1140 return true;
1141
1142 // On Southern Islands instruction with a negative base value and an offset
1143 // don't seem to work.
1144 return CurDAG->SignBitIsZero(Base);
1145}
1146
1147// Return whether the operation has the NoUnsignedWrap property.
1148static bool isNoUnsignedWrap(SDValue Addr) {
1149 return (Addr.getOpcode() == ISD::ADD &&
1150 Addr->getFlags().hasNoUnsignedWrap()) ||
1151 Addr->getOpcode() == ISD::OR;
1152}
1153
1154// Check that the base address of flat scratch load/store in the form of `base +
1155// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1156// requirement). We always treat the first operand as the base address here.
1157bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1158 if (isNoUnsignedWrap(Addr))
1159 return true;
1160
1161 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1162 // values.
1163 if (Subtarget->hasSignedScratchOffsets())
1164 return true;
1165
1166 auto LHS = Addr.getOperand(0);
1167 auto RHS = Addr.getOperand(1);
1168
1169 // If the immediate offset is negative and within certain range, the base
1170 // address cannot also be negative. If the base is also negative, the sum
1171 // would be either negative or much larger than the valid range of scratch
1172 // memory a thread can access.
1173 ConstantSDNode *ImmOp = nullptr;
1174 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1175 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1176 return true;
1177 }
1178
1179 return CurDAG->SignBitIsZero(LHS);
1180}
1181
1182// Check that the address values in the SGPR/VGPR are legal for flat scratch
1183// in the form of: SGPR + VGPR.
1184bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1185 if (isNoUnsignedWrap(Addr))
1186 return true;
1187
1188 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1189 // values.
1190 if (Subtarget->hasSignedScratchOffsets())
1191 return true;
1192
1193 auto LHS = Addr.getOperand(0);
1194 auto RHS = Addr.getOperand(1);
1195 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1196}
1197
1198// Check that the address values in the SGPR/VGPR are legal for flat scratch
1199// in the form of: SGPR + VGPR + Imm.
1200bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1201 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1202 // values.
1203 if (AMDGPU::isGFX12Plus(*Subtarget))
1204 return true;
1205
1206 auto Base = Addr.getOperand(0);
1207 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1208 // If the immediate offset is negative and within certain range, the base
1209 // address cannot also be negative. If the base is also negative, the sum
1210 // would be either negative or much larger than the valid range of scratch
1211 // memory a thread can access.
1212 if (isNoUnsignedWrap(Base) &&
1213 (isNoUnsignedWrap(Addr) ||
1214 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1215 return true;
1216
1217 auto LHS = Base.getOperand(0);
1218 auto RHS = Base.getOperand(1);
1219 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1220}
1221
1222// TODO: If offset is too big, put low 16-bit into offset.
1223bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1224 SDValue &Offset0,
1225 SDValue &Offset1) const {
1226 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1227}
1228
1229bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1230 SDValue &Offset0,
1231 SDValue &Offset1) const {
1232 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1233}
1234
1235bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1236 SDValue &Offset0, SDValue &Offset1,
1237 unsigned Size) const {
1238 SDLoc DL(Addr);
1239
1240 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1241 SDValue N0 = Addr.getOperand(0);
1242 SDValue N1 = Addr.getOperand(1);
1243 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1244 unsigned OffsetValue0 = C1->getZExtValue();
1245 unsigned OffsetValue1 = OffsetValue0 + Size;
1246
1247 // (add n0, c0)
1248 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1249 Base = N0;
1250 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1251 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1252 return true;
1253 }
1254 } else if (Addr.getOpcode() == ISD::SUB) {
1255 // sub C, x -> add (sub 0, x), C
1256 if (const ConstantSDNode *C =
1257 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1258 unsigned OffsetValue0 = C->getZExtValue();
1259 unsigned OffsetValue1 = OffsetValue0 + Size;
1260
1261 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1262 SDLoc DL(Addr);
1263 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1264
1265 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1266 // the known bits in isDSOffsetLegal. We need to emit the selected node
1267 // here, so this is thrown away.
1268 SDValue Sub =
1269 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1270
1271 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1272 SmallVector<SDValue, 3> Opnds;
1273 Opnds.push_back(Zero);
1274 Opnds.push_back(Addr.getOperand(1));
1275 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1276 if (Subtarget->hasAddNoCarry()) {
1277 SubOp = AMDGPU::V_SUB_U32_e64;
1278 Opnds.push_back(
1279 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1280 }
1281
1282 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1283 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1284
1285 Base = SDValue(MachineSub, 0);
1286 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1287 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1288 return true;
1289 }
1290 }
1291 }
1292 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1293 unsigned OffsetValue0 = CAddr->getZExtValue();
1294 unsigned OffsetValue1 = OffsetValue0 + Size;
1295
1296 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1297 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1298 MachineSDNode *MovZero =
1299 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1300 Base = SDValue(MovZero, 0);
1301 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1302 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1303 return true;
1304 }
1305 }
1306
1307 // default case
1308
1309 Base = Addr;
1310 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1311 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1312 return true;
1313}
1314
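// Split a MUBUF address into the components of its addressing mode: a uniform
// base pointer (Ptr) used to build the resource, an optional divergent 64-bit
// address (VAddr, with Addr64 set), a scalar offset register (SOffset) and an
// immediate offset. Divergent parts of the address go to VAddr, uniform parts
// to Ptr, and an out-of-range immediate is moved into SOffset.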
1315bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1316 SDValue &SOffset, SDValue &Offset,
1317 SDValue &Offen, SDValue &Idxen,
1318 SDValue &Addr64) const {
1319 // Subtarget prefers to use flat instructions
1320 // FIXME: This should be a pattern predicate and not reach here
1321 if (Subtarget->useFlatForGlobal())
1322 return false;
1323
1324 SDLoc DL(Addr);
1325
1326 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1327 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1328 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1329 SOffset = Subtarget->hasRestrictedSOffset()
1330 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1331 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1332
1333 ConstantSDNode *C1 = nullptr;
1334 SDValue N0 = Addr;
1335 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1336 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1337 if (isUInt<32>(C1->getZExtValue()))
1338 N0 = Addr.getOperand(0);
1339 else
1340 C1 = nullptr;
1341 }
1342
1343 if (N0.getOpcode() == ISD::ADD) {
1344 // (add N2, N3) -> addr64, or
1345 // (add (add N2, N3), C1) -> addr64
1346 SDValue N2 = N0.getOperand(0);
1347 SDValue N3 = N0.getOperand(1);
1348 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1349
1350 if (N2->isDivergent()) {
1351 if (N3->isDivergent()) {
1352 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1353 // addr64, and construct the resource from a 0 address.
1354 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1355 VAddr = N0;
1356 } else {
1357 // N2 is divergent, N3 is not.
1358 Ptr = N3;
1359 VAddr = N2;
1360 }
1361 } else {
1362 // N2 is not divergent.
1363 Ptr = N2;
1364 VAddr = N3;
1365 }
1366 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1367 } else if (N0->isDivergent()) {
1368 // N0 is divergent. Use it as the addr64, and construct the resource from a
1369 // 0 address.
1370 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1371 VAddr = N0;
1372 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1373 } else {
1374 // N0 -> offset, or
1375 // (N0 + C1) -> offset
1376 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1377 Ptr = N0;
1378 }
1379
1380 if (!C1) {
1381 // No offset.
1382 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1383 return true;
1384 }
1385
1386 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1387 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1388 // Legal offset for instruction.
1389 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1390 return true;
1391 }
1392
1393 // Illegal offset, store it in soffset.
1394 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1395 SOffset =
1396 SDValue(CurDAG->getMachineNode(
1397 AMDGPU::S_MOV_B32, DL, MVT::i32,
1398 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1399 0);
1400 return true;
1401}
1402
1403bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1404 SDValue &VAddr, SDValue &SOffset,
1405 SDValue &Offset) const {
1406 SDValue Ptr, Offen, Idxen, Addr64;
1407
1408 // addr64 bit was removed for volcanic islands.
1409 // FIXME: This should be a pattern predicate and not reach here
1410 if (!Subtarget->hasAddr64())
1411 return false;
1412
1413 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1414 return false;
1415
1416 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1417 if (C->getSExtValue()) {
1418 SDLoc DL(Addr);
1419
1420 const SITargetLowering& Lowering =
1421 *static_cast<const SITargetLowering*>(getTargetLowering());
1422
1423 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1424 return true;
1425 }
1426
1427 return false;
1428}
1429
1430std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1431 SDLoc DL(N);
1432
1433 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1434 SDValue TFI =
1435 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1436
1437 // We rebase the base address into an absolute stack address and hence
1438 // use constant 0 for soffset. This value must be retained until
1439 // frame elimination and eliminateFrameIndex will choose the appropriate
1440 // frame register if need be.
1441 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1442}
1443
1444bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1445 SDValue Addr, SDValue &Rsrc,
1446 SDValue &VAddr, SDValue &SOffset,
1447 SDValue &ImmOffset) const {
1448
1449 SDLoc DL(Addr);
1450 MachineFunction &MF = CurDAG->getMachineFunction();
1451 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1452
1453 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1454
1455 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1456 int64_t Imm = CAddr->getSExtValue();
1457 const int64_t NullPtr =
1458 AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1459 // Don't fold null pointer.
1460 if (Imm != NullPtr) {
1461 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1462 SDValue HighBits =
1463 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1464 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1465 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1466 VAddr = SDValue(MovHighBits, 0);
1467
1468 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1469 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1470 return true;
1471 }
1472 }
1473
1474 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1475 // (add n0, c1)
1476
1477 SDValue N0 = Addr.getOperand(0);
1478 uint64_t C1 = Addr.getConstantOperandVal(1);
1479
1480 // Offsets in vaddr must be positive if range checking is enabled.
1481 //
1482 // The total computation of vaddr + soffset + offset must not overflow. If
1483 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1484 // overflowing.
1485 //
1486 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1487 // always perform a range check. If a negative vaddr base index was used,
1488 // this would fail the range check. The overall address computation would
1489 // compute a valid address, but this doesn't happen due to the range
1490 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1491 //
1492 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1493 // MUBUF vaddr, but not on older subtargets which can only do this if the
1494 // sign bit is known 0.
1495 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1496 if (TII->isLegalMUBUFImmOffset(C1) &&
1497 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1498 CurDAG->SignBitIsZero(N0))) {
1499 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1500 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1501 return true;
1502 }
1503 }
1504
1505 // (node)
1506 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1507 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1508 return true;
1509}
1510
1511static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1512 if (Val.getOpcode() != ISD::CopyFromReg)
1513 return false;
1514 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1515 if (!Reg.isPhysical())
1516 return false;
1517 auto RC = TRI.getPhysRegBaseClass(Reg);
1518 return RC && TRI.isSGPRClass(RC);
1519}
1520
1521bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1522 SDValue Addr,
1523 SDValue &SRsrc,
1524 SDValue &SOffset,
1525 SDValue &Offset) const {
1526 const SIRegisterInfo *TRI =
1527 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1528 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1529 MachineFunction &MF = CurDAG->getMachineFunction();
1530 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1531 SDLoc DL(Addr);
1532
1533 // CopyFromReg <sgpr>
1534 if (IsCopyFromSGPR(*TRI, Addr)) {
1535 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1536 SOffset = Addr;
1537 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1538 return true;
1539 }
1540
1541 ConstantSDNode *CAddr;
1542 if (Addr.getOpcode() == ISD::ADD) {
1543 // Add (CopyFromReg <sgpr>) <constant>
1544 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1545 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1546 return false;
1547 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1548 return false;
1549
1550 SOffset = Addr.getOperand(0);
1551 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1552 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1553 // <constant>
1554 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1555 } else {
1556 return false;
1557 }
1558
1559 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1560
1561 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1562 return true;
1563}
1564
1565bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1566 SDValue &SOffset, SDValue &Offset
1567 ) const {
1568 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1569 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1570
1571 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1572 return false;
1573
1574 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1575 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1576 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1577 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1578 APInt::getAllOnes(32).getZExtValue(); // Size
1579 SDLoc DL(Addr);
1580
1581 const SITargetLowering& Lowering =
1582 *static_cast<const SITargetLowering*>(getTargetLowering());
1583
1584 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1585 return true;
1586 }
1587 return false;
1588}
1589
1590bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1591 SDValue &SOffset) const {
1592 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1593 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1594 return true;
1595 }
1596
1597 SOffset = ByteOffsetNode;
1598 return true;
1599}
1600
1601// Find a load or store from corresponding pattern root.
1602// Roots may be build_vector, bitconvert or their combinations.
1605 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1606 return MN;
1607 assert(isa<BuildVectorSDNode>(N));
1608 for (SDValue V : N->op_values())
1609 if (MemSDNode *MN =
1610 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1611 return MN;
1612 llvm_unreachable("cannot find MemSDNode in the pattern!");
1613}
1614
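// Split a flat/global/scratch address into a base (VAddr) plus an immediate
// offset that is legal for the given FLAT instruction variant. If the constant
// offset does not fit, only the legal low part is kept in the offset field and
// the remainder is added back to the base with VALU adds, keeping both pieces
// the same sign so the address stays within the original object.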
1615bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1616 SDValue &VAddr, SDValue &Offset,
1617 uint64_t FlatVariant) const {
1618 int64_t OffsetVal = 0;
1619
1620 unsigned AS = findMemSDNode(N)->getAddressSpace();
1621
1622 bool CanHaveFlatSegmentOffsetBug =
1623 Subtarget->hasFlatSegmentOffsetBug() &&
1624 FlatVariant == SIInstrFlags::FLAT &&
1625 AS == AMDGPUAS::FLAT_ADDRESS;
1626
1627 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1628 SDValue N0, N1;
1629 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1630 (FlatVariant != SIInstrFlags::FlatScratch ||
1631 isFlatScratchBaseLegal(Addr))) {
1632 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1633
1634 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1635 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1636 Addr = N0;
1637 OffsetVal = COffsetVal;
1638 } else {
1639 // If the offset doesn't fit, put the low bits into the offset field and
1640 // add the rest.
1641 //
1642 // For a FLAT instruction the hardware decides whether to access
1643 // global/scratch/shared memory based on the high bits of vaddr,
1644 // ignoring the offset field, so we have to ensure that when we add
1645 // remainder to vaddr it still points into the same underlying object.
1646 // The easiest way to do that is to make sure that we split the offset
1647 // into two pieces that are both >= 0 or both <= 0.
1648
1649 SDLoc DL(N);
1650 uint64_t RemainderOffset;
1651
1652 std::tie(OffsetVal, RemainderOffset) =
1653 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1654
1655 SDValue AddOffsetLo =
1656 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1657 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1658
1659 if (Addr.getValueType().getSizeInBits() == 32) {
1660 SmallVector<SDValue, 3> Opnds;
1661 Opnds.push_back(N0);
1662 Opnds.push_back(AddOffsetLo);
1663 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1664 if (Subtarget->hasAddNoCarry()) {
1665 AddOp = AMDGPU::V_ADD_U32_e64;
1666 Opnds.push_back(Clamp);
1667 }
1668 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1669 } else {
1670 // TODO: Should this try to use a scalar add pseudo if the base address
1671 // is uniform and saddr is usable?
1672 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1673 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1674
1675 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1676 DL, MVT::i32, N0, Sub0);
1677 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1678 DL, MVT::i32, N0, Sub1);
1679
1680 SDValue AddOffsetHi =
1681 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1682
1683 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1684
1685 SDNode *Add =
1686 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1687 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1688
1689 SDNode *Addc = CurDAG->getMachineNode(
1690 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1691 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1692
1693 SDValue RegSequenceArgs[] = {
1694 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1695 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1696
1697 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1698 MVT::i64, RegSequenceArgs),
1699 0);
1700 }
1701 }
1702 }
1703 }
1704
1705 VAddr = Addr;
1706 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1707 return true;
1708}
1709
1710bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1711 SDValue &VAddr,
1712 SDValue &Offset) const {
1713 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1714}
1715
1716bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1717 SDValue &VAddr,
1718 SDValue &Offset) const {
1719 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1720}
1721
1722bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1723 SDValue &VAddr,
1724 SDValue &Offset) const {
1725 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1726 SIInstrFlags::FlatScratch);
1727}
1728
1729// If this matches zero_extend i32:x, return x
1730static SDValue matchZExtFromI32(SDValue Op) {
1731 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1732 return SDValue();
1733
1734 SDValue ExtSrc = Op.getOperand(0);
1735 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1736}
1737
1738// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
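// i.e. the "saddr" form of global instructions: the uniform 64-bit base goes
// in SAddr, a zero-extended 32-bit VGPR (or a materialized constant) in
// VOffset, and the signed immediate in Offset.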
1739bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1740 SDValue Addr,
1741 SDValue &SAddr,
1742 SDValue &VOffset,
1743 SDValue &Offset) const {
1744 int64_t ImmOffset = 0;
1745
1746 // Match the immediate offset first, which canonically is moved as low as
1747 // possible.
1748
1749 SDValue LHS, RHS;
1750 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1751 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1752 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1753
1754 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1755 SIInstrFlags::FlatGlobal)) {
1756 Addr = LHS;
1757 ImmOffset = COffsetVal;
1758 } else if (!LHS->isDivergent()) {
1759 if (COffsetVal > 0) {
1760 SDLoc SL(N);
1761 // saddr + large_offset -> saddr +
1762 // (voffset = large_offset & ~MaxOffset) +
1763 // (large_offset & MaxOffset);
1764 int64_t SplitImmOffset, RemainderOffset;
1765 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1766 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1767
1768 if (isUInt<32>(RemainderOffset)) {
1769 SDNode *VMov = CurDAG->getMachineNode(
1770 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1771 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1772 VOffset = SDValue(VMov, 0);
1773 SAddr = LHS;
1774 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1775 return true;
1776 }
1777 }
1778
1779 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
1780 // is 1, we would need to perform 1 or 2 extra moves for each half of
1781 // the constant, and it is better to do a scalar add and then issue a
1782 // single VALU instruction to materialize zero. Otherwise it takes fewer
1783 // instructions to perform VALU adds with immediates or inline literals.
1784 unsigned NumLiterals =
1785 !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1786 !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1787 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1788 return false;
1789 }
1790 }
1791
1792 // Match the variable offset.
1793 if (Addr.getOpcode() == ISD::ADD) {
1794 LHS = Addr.getOperand(0);
1795 RHS = Addr.getOperand(1);
1796
1797 if (!LHS->isDivergent()) {
1798 // add (i64 sgpr), (zero_extend (i32 vgpr))
1799 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1800 SAddr = LHS;
1801 VOffset = ZextRHS;
1802 }
1803 }
1804
1805 if (!SAddr && !RHS->isDivergent()) {
1806 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1807 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1808 SAddr = RHS;
1809 VOffset = ZextLHS;
1810 }
1811 }
1812
1813 if (SAddr) {
1814 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1815 return true;
1816 }
1817 }
1818
1819 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1820 isa<ConstantSDNode>(Addr))
1821 return false;
1822
1823 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1824 // moves required to copy a 64-bit SGPR to VGPR.
1825 SAddr = Addr;
1826 SDNode *VMov =
1827 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1828 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1829 VOffset = SDValue(VMov, 0);
1830 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1831 return true;
1832}
1833
1834 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1835 if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1836 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1837 } else if (SAddr.getOpcode() == ISD::ADD &&
1838 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1839 // Materialize this into a scalar move for scalar address to avoid
1840 // readfirstlane.
1841 auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1842 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1843 FI->getValueType(0));
1844 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1845 MVT::i32, TFI, SAddr.getOperand(1)),
1846 0);
1847 }
1848
1849 return SAddr;
1850}
1851
1852// Match (32-bit SGPR base) + sext(imm offset)
1853bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1854 SDValue &SAddr,
1855 SDValue &Offset) const {
1856 if (Addr->isDivergent())
1857 return false;
1858
1859 SDLoc DL(Addr);
1860
1861 int64_t COffsetVal = 0;
1862
1863 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1864 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1865 SAddr = Addr.getOperand(0);
1866 } else {
1867 SAddr = Addr;
1868 }
1869
1870 SAddr = SelectSAddrFI(CurDAG, SAddr);
1871
1872 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1873
1874 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1875 SIInstrFlags::FlatScratch)) {
1876 int64_t SplitImmOffset, RemainderOffset;
1877 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1878 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1879
1880 COffsetVal = SplitImmOffset;
1881
1882 SDValue AddOffset =
1883 SAddr.getOpcode() == ISD::TargetFrameIndex
1884 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1885 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1886 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1887 SAddr, AddOffset),
1888 0);
1889 }
1890
1891 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1892
1893 return true;
1894}
1895
1896// Check whether the flat scratch SVS swizzle bug affects this access.
1897bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1898 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1899 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1900 return false;
1901
1902 // The bug affects the swizzling of SVS accesses if there is any carry out
1903 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1904 // voffset to (soffset + inst_offset).
1905 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1906 KnownBits SKnown = KnownBits::computeForAddSub(
1907 /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
1908 CurDAG->computeKnownBits(SAddr),
1909 KnownBits::makeConstant(APInt(32, ImmOffset)));
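// If the maximum possible low two bits of the two addends can sum to 4 or
// more, a carry out of bit 1 cannot be ruled out, so report the bug as hit.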
1910 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1911 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1912 return (VMax & 3) + (SMax & 3) >= 4;
1913}
1914
1915bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1916 SDValue &VAddr, SDValue &SAddr,
1917 SDValue &Offset) const {
1918 int64_t ImmOffset = 0;
1919
1920 SDValue LHS, RHS;
1921 SDValue OrigAddr = Addr;
1922 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1923 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1924 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1925
1926 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1927 Addr = LHS;
1928 ImmOffset = COffsetVal;
1929 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1930 SDLoc SL(N);
1931 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1932 // (large_offset & MaxOffset);
1933 int64_t SplitImmOffset, RemainderOffset;
1934 std::tie(SplitImmOffset, RemainderOffset)
1935 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1936
1937 if (isUInt<32>(RemainderOffset)) {
1938 SDNode *VMov = CurDAG->getMachineNode(
1939 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1940 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1941 VAddr = SDValue(VMov, 0);
1942 SAddr = LHS;
1943 if (!isFlatScratchBaseLegal(Addr))
1944 return false;
1945 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1946 return false;
1947 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1948 return true;
1949 }
1950 }
1951 }
1952
1953 if (Addr.getOpcode() != ISD::ADD)
1954 return false;
1955
1956 LHS = Addr.getOperand(0);
1957 RHS = Addr.getOperand(1);
1958
1959 if (!LHS->isDivergent() && RHS->isDivergent()) {
1960 SAddr = LHS;
1961 VAddr = RHS;
1962 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1963 SAddr = RHS;
1964 VAddr = LHS;
1965 } else {
1966 return false;
1967 }
1968
1969 if (OrigAddr != Addr) {
1970 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1971 return false;
1972 } else {
1973 if (!isFlatScratchBaseLegalSV(OrigAddr))
1974 return false;
1975 }
1976
1977 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1978 return false;
1979 SAddr = SelectSAddrFI(CurDAG, SAddr);
1980 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1981 return true;
1982}
1983
1984// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
1985// not null) offset. If Imm32Only is true, match only 32-bit immediate
1986// offsets available on CI.
1987bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1988 SDValue *SOffset, SDValue *Offset,
1989 bool Imm32Only, bool IsBuffer) const {
1990 assert((!SOffset || !Offset) &&
1991 "Cannot match both soffset and offset at the same time!");
1992
1993 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1994 if (!C) {
1995 if (!SOffset)
1996 return false;
1997 if (ByteOffsetNode.getValueType().isScalarInteger() &&
1998 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1999 *SOffset = ByteOffsetNode;
2000 return true;
2001 }
2002 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2003 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2004 *SOffset = ByteOffsetNode.getOperand(0);
2005 return true;
2006 }
2007 }
2008 return false;
2009 }
2010
2011 SDLoc SL(ByteOffsetNode);
2012
2013 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2014 // offset for S_BUFFER instructions is unsigned.
2015 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2016 std::optional<int64_t> EncodedOffset =
2017 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
2018 if (EncodedOffset && Offset && !Imm32Only) {
2019 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2020 return true;
2021 }
2022
2023 // SGPR and literal offsets are unsigned.
2024 if (ByteOffset < 0)
2025 return false;
2026
2027 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2028 if (EncodedOffset && Offset && Imm32Only) {
2029 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2030 return true;
2031 }
2032
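// The offset must fit in 32 bits so it can be materialized into an SGPR
// with a single S_MOV_B32 below.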
2033 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2034 return false;
2035
2036 if (SOffset) {
2037 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2038 *SOffset = SDValue(
2039 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2040 return true;
2041 }
2042
2043 return false;
2044}
2045
2046SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2047 if (Addr.getValueType() != MVT::i32)
2048 return Addr;
2049
2050 // Zero-extend a 32-bit address.
2051 SDLoc SL(Addr);
2052
2053 const MachineFunction &MF = CurDAG->getMachineFunction();
2054 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2055 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2056 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2057
2058 const SDValue Ops[] = {
2059 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2060 Addr,
2061 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2062 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2063 0),
2064 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2065 };
2066
2067 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2068 Ops), 0);
2069}
2070
2071// Match a base and an immediate (if Offset is not null) or an SGPR (if
2072// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2073// true, match only 32-bit immediate offsets available on CI.
2074bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2075 SDValue *SOffset, SDValue *Offset,
2076 bool Imm32Only,
2077 bool IsBuffer) const {
2078 if (SOffset && Offset) {
2079 assert(!Imm32Only && !IsBuffer);
2080 SDValue B;
2081 return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
2082 SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
2083 }
2084
2085 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2086 // wraparound, because s_load instructions perform the addition in 64 bits.
2087 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2088 !Addr->getFlags().hasNoUnsignedWrap())
2089 return false;
2090
2091 SDValue N0, N1;
2092 // Extract the base and offset if possible.
2093 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2094 N0 = Addr.getOperand(0);
2095 N1 = Addr.getOperand(1);
2096 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2097 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2098 }
2099 if (!N0 || !N1)
2100 return false;
2101 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2102 SBase = N0;
2103 return true;
2104 }
2105 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2106 SBase = N1;
2107 return true;
2108 }
2109 return false;
2110}
2111
2112bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2113 SDValue *SOffset, SDValue *Offset,
2114 bool Imm32Only) const {
2115 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2116 SBase = Expand32BitAddress(SBase);
2117 return true;
2118 }
2119
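// A bare 32-bit address with no matched base+offset can still be selected
// by zero-extending it and using an immediate offset of 0.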
2120 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2121 SBase = Expand32BitAddress(Addr);
2122 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2123 return true;
2124 }
2125
2126 return false;
2127}
2128
2129bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2130 SDValue &Offset) const {
2131 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2132}
2133
2134bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2135 SDValue &Offset) const {
2136 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2137 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2138 /* Imm32Only */ true);
2139}
2140
2141bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2142 SDValue &SOffset) const {
2143 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2144}
2145
2146bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2147 SDValue &SOffset,
2148 SDValue &Offset) const {
2149 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2150}
2151
2152bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2153 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2154 /* Imm32Only */ false, /* IsBuffer */ true);
2155}
2156
2157bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2158 SDValue &Offset) const {
2159 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2160 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2161 /* Imm32Only */ true, /* IsBuffer */ true);
2162}
2163
2164bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2165 SDValue &Offset) const {
2166 // Match the (soffset + offset) pair as a 32-bit register base and
2167 // an immediate offset.
2168 return N.getValueType() == MVT::i32 &&
2169 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2170 &Offset, /* Imm32Only */ false,
2171 /* IsBuffer */ true);
2172}
2173
2174bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2175 SDValue &Base,
2176 SDValue &Offset) const {
2177 SDLoc DL(Index);
2178
2179 if (CurDAG->isBaseWithConstantOffset(Index)) {
2180 SDValue N0 = Index.getOperand(0);
2181 SDValue N1 = Index.getOperand(1);
2182 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2183
2184 // (add n0, c0)
2185 // Don't peel off the offset (c0) if doing so could possibly lead
2186 // the base (n0) to be negative.
2187 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2188 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2189 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2190 Base = N0;
2191 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2192 return true;
2193 }
2194 }
2195
2196 if (isa<ConstantSDNode>(Index))
2197 return false;
2198
2199 Base = Index;
2200 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2201 return true;
2202}
2203
2204SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2205 SDValue Val, uint32_t Offset,
2206 uint32_t Width) {
2207 if (Val->isDivergent()) {
2208 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2209 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2210 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2211
2212 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2213 }
2214 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2215 // Transformation function, pack the offset and width of a BFE into
2216 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2217 // source, bits [5:0] contain the offset and bits [22:16] the width.
2218 uint32_t PackedVal = Offset | (Width << 16);
2219 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2220
2221 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2222}
2223
2224void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2225 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2226 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2227 // Predicate: 0 < b <= c < 32
2228
2229 const SDValue &Shl = N->getOperand(0);
2230 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2231 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2232
2233 if (B && C) {
2234 uint32_t BVal = B->getZExtValue();
2235 uint32_t CVal = C->getZExtValue();
2236
2237 if (0 < BVal && BVal <= CVal && CVal < 32) {
2238 bool Signed = N->getOpcode() == ISD::SRA;
2239 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2240 32 - CVal));
2241 return;
2242 }
2243 }
2244 SelectCode(N);
2245}
2246
2247void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2248 switch (N->getOpcode()) {
2249 case ISD::AND:
2250 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2251 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2252 // Predicate: isMask(mask)
2253 const SDValue &Srl = N->getOperand(0);
2254 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2255 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2256
2257 if (Shift && Mask) {
2258 uint32_t ShiftVal = Shift->getZExtValue();
2259 uint32_t MaskVal = Mask->getZExtValue();
2260
2261 if (isMask_32(MaskVal)) {
2262 uint32_t WidthVal = llvm::popcount(MaskVal);
2263 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2264 WidthVal));
2265 return;
2266 }
2267 }
2268 }
2269 break;
2270 case ISD::SRL:
2271 if (N->getOperand(0).getOpcode() == ISD::AND) {
2272 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2273 // Predicate: isMask(mask >> b)
2274 const SDValue &And = N->getOperand(0);
2275 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2276 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2277
2278 if (Shift && Mask) {
2279 uint32_t ShiftVal = Shift->getZExtValue();
2280 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2281
2282 if (isMask_32(MaskVal)) {
2283 uint32_t WidthVal = llvm::popcount(MaskVal);
2284 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2285 WidthVal));
2286 return;
2287 }
2288 }
2289 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2290 SelectS_BFEFromShifts(N);
2291 return;
2292 }
2293 break;
2294 case ISD::SRA:
2295 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2296 SelectS_BFEFromShifts(N);
2297 return;
2298 }
2299 break;
2300
2301 case ISD::SIGN_EXTEND_INREG: {
2302 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2303 SDValue Src = N->getOperand(0);
2304 if (Src.getOpcode() != ISD::SRL)
2305 break;
2306
2307 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2308 if (!Amt)
2309 break;
2310
2311 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2312 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2313 Amt->getZExtValue(), Width));
2314 return;
2315 }
2316 }
2317
2318 SelectCode(N);
2319}
2320
2321bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2322 assert(N->getOpcode() == ISD::BRCOND);
2323 if (!N->hasOneUse())
2324 return false;
2325
2326 SDValue Cond = N->getOperand(1);
2327 if (Cond.getOpcode() == ISD::CopyToReg)
2328 Cond = Cond.getOperand(2);
2329
2330 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2331 return false;
2332
2333 MVT VT = Cond.getOperand(0).getSimpleValueType();
2334 if (VT == MVT::i32)
2335 return true;
2336
2337 if (VT == MVT::i64) {
2338 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2339
2340 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2341 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2342 }
2343
2344 return false;
2345}
2346
2347static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2348 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2349 // Special case for amdgcn.ballot:
2350 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2351 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2352 // =>
2353 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2354 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2355 // Cond becomes an i(WaveSize) full mask value.
2356 // Note that ballot doesn't use the SETEQ condition, but it's easy to support it
2357 // here for completeness, so in this case Negate is set true on return.
2358 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2359 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2360 isNullConstant(VCMP.getOperand(1))) {
2361
2362 auto Cond = VCMP.getOperand(0);
2363 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2364 Cond = Cond.getOperand(0);
2365
2366 if (isBoolSGPR(Cond)) {
2367 Negate = VCMP_CC == ISD::SETEQ;
2368 return Cond;
2369 }
2370 }
2371 return SDValue();
2372}
2373
2374void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2375 SDValue Cond = N->getOperand(1);
2376
2377 if (Cond.isUndef()) {
2378 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2379 N->getOperand(2), N->getOperand(0));
2380 return;
2381 }
2382
2383 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2384 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2385
2386 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2387 bool AndExec = !UseSCCBr;
2388 bool Negate = false;
2389
2390 if (Cond.getOpcode() == ISD::SETCC &&
2391 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2392 SDValue VCMP = Cond->getOperand(0);
2393 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2394 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2395 isNullConstant(Cond->getOperand(1)) &&
2396 // We may encounter ballot.i64 in wave32 mode on -O0.
2397 VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2398 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2399 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2400 // BRCOND i1 %C, %BB
2401 // =>
2402 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2403 // VCC = COPY i(WaveSize) %VCMP
2404 // S_CBRANCH_VCCNZ/VCCZ %BB
2405 Negate = CC == ISD::SETEQ;
2406 bool NegatedBallot = false;
2407 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2408 Cond = BallotCond;
2409 UseSCCBr = !BallotCond->isDivergent();
2410 Negate = Negate ^ NegatedBallot;
2411 } else {
2412 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2413 // selected as V_CMP, but this may change for uniform condition.
2414 Cond = VCMP;
2415 UseSCCBr = false;
2416 }
2417 }
2418 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2419 // V_CMPs resulting from a ballot, or a ballot with a uniform condition, in
2420 // which case SCC is used.
2421 AndExec = false;
2422 }
2423
2424 unsigned BrOp =
2425 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2426 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2427 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2428 SDLoc SL(N);
2429
2430 if (AndExec) {
2431 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2432 // analyzed what generates the vcc value, so we do not know whether vcc
2433 // bits for disabled lanes are 0. Thus we need to mask out bits for
2434 // disabled lanes.
2435 //
2436 // For the case that we select S_CBRANCH_SCC1 and it gets
2437 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2438 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2439 //
2440 // We could add an analysis of what generates the vcc value here and omit
2441 // the S_AND when it is unnecessary. But it would be better to add a separate
2442 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2443 // catches both cases.
2444 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2445 : AMDGPU::S_AND_B64,
2446 SL, MVT::i1,
2447 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2448 : AMDGPU::EXEC,
2449 MVT::i1),
2450 Cond),
2451 0);
2452 }
2453
2454 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2455 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2456 N->getOperand(2), // Basic Block
2457 VCC.getValue(0));
2458}
2459
2460void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2461 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2462 !N->isDivergent()) {
2463 SDValue Src = N->getOperand(0);
2464 if (Src.getValueType() == MVT::f16) {
2465 if (isExtractHiElt(Src, Src)) {
2466 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2467 {Src});
2468 return;
2469 }
2470 }
2471 }
2472
2473 SelectCode(N);
2474}
2475
2476void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2477 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2478 // be copied to an SGPR with readfirstlane.
2479 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2480 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2481
2482 SDValue Chain = N->getOperand(0);
2483 SDValue Ptr = N->getOperand(2);
2484 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2485 MachineMemOperand *MMO = M->getMemOperand();
2486 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2487
2488 SDValue Offset;
2489 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2490 SDValue PtrBase = Ptr.getOperand(0);
2491 SDValue PtrOffset = Ptr.getOperand(1);
2492
2493 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2494 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2495 N = glueCopyToM0(N, PtrBase);
2496 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2497 }
2498 }
2499
2500 if (!Offset) {
2501 N = glueCopyToM0(N, Ptr);
2502 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2503 }
2504
2505 SDValue Ops[] = {
2506 Offset,
2507 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2508 Chain,
2509 N->getOperand(N->getNumOperands() - 1) // New glue
2510 };
2511
2512 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2513 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2514}
2515
2516// We need to handle this here because tablegen doesn't support matching
2517// instructions with multiple outputs.
2518void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2519 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2520 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2521 N->getOperand(5), N->getOperand(0)};
2522
2523 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2524 MachineMemOperand *MMO = M->getMemOperand();
2525 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2526 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2527}
2528
2529static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2530 switch (IntrID) {
2531 case Intrinsic::amdgcn_ds_gws_init:
2532 return AMDGPU::DS_GWS_INIT;
2533 case Intrinsic::amdgcn_ds_gws_barrier:
2534 return AMDGPU::DS_GWS_BARRIER;
2535 case Intrinsic::amdgcn_ds_gws_sema_v:
2536 return AMDGPU::DS_GWS_SEMA_V;
2537 case Intrinsic::amdgcn_ds_gws_sema_br:
2538 return AMDGPU::DS_GWS_SEMA_BR;
2539 case Intrinsic::amdgcn_ds_gws_sema_p:
2540 return AMDGPU::DS_GWS_SEMA_P;
2541 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2542 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2543 default:
2544 llvm_unreachable("not a gws intrinsic");
2545 }
2546}
2547
2548void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2549 if (!Subtarget->hasGWS() ||
2550 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2551 !Subtarget->hasGWSSemaReleaseAll())) {
2552 // Let this error.
2553 SelectCode(N);
2554 return;
2555 }
2556
2557 // Chain, intrinsic ID, vsrc, offset
2558 const bool HasVSrc = N->getNumOperands() == 4;
2559 assert(HasVSrc || N->getNumOperands() == 3);
2560
2561 SDLoc SL(N);
2562 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2563 int ImmOffset = 0;
2564 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2565 MachineMemOperand *MMO = M->getMemOperand();
2566
2567 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
2568 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2569
2570 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2571 // offset field) % 64. Some versions of the programming guide omit the m0
2572 // part, or claim it's from offset 0.
2573 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2574 // If we have a constant offset, try to use the 0 in m0 as the base.
2575 // TODO: Look into changing the default m0 initialization value. If the
2576 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2577 // the immediate offset.
2578 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2579 ImmOffset = ConstOffset->getZExtValue();
2580 } else {
2581 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2582 ImmOffset = BaseOffset.getConstantOperandVal(1);
2583 BaseOffset = BaseOffset.getOperand(0);
2584 }
2585
2586 // Prefer to do the shift in an SGPR since it should be possible to use m0
2587 // as the result directly. If it's already an SGPR, it will be eliminated
2588 // later.
2589 SDNode *SGPROffset
2590 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2591 BaseOffset);
2592 // Shift to offset in m0
2593 SDNode *M0Base
2594 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2595 SDValue(SGPROffset, 0),
2596 CurDAG->getTargetConstant(16, SL, MVT::i32));
2597 glueCopyToM0(N, SDValue(M0Base, 0));
2598 }
2599
2600 SDValue Chain = N->getOperand(0);
2601 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2602
2603 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2604 SmallVector<SDValue, 5> Ops;
2605 if (HasVSrc)
2606 Ops.push_back(N->getOperand(2));
2607 Ops.push_back(OffsetField);
2608 Ops.push_back(Chain);
2609
2610 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2611 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2612}
2613
2614void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2615 if (Subtarget->getLDSBankCount() != 16) {
2616 // This is a single instruction with a pattern.
2617 SelectCode(N);
2618 return;
2619 }
2620
2621 SDLoc DL(N);
2622
2623 // This requires 2 instructions. It is possible to write a pattern to support
2624 // this, but the generated isel emitter doesn't correctly deal with multiple
2625 // output instructions using the same physical register input. The copy to m0
2626 // is incorrectly placed before the second instruction.
2627 //
2628 // TODO: Match source modifiers.
2629 //
2630 // def : Pat <
2631 // (int_amdgcn_interp_p1_f16
2632 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2633 // (i32 timm:$attrchan), (i32 timm:$attr),
2634 // (i1 timm:$high), M0),
2635 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2636 // timm:$attrchan, 0,
2637 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2638 // let Predicates = [has16BankLDS];
2639 // }
2640
2641 // 16 bank LDS
2642 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2643 N->getOperand(5), SDValue());
2644
2645 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2646
2647 SDNode *InterpMov =
2648 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2649 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2650 N->getOperand(3), // Attr
2651 N->getOperand(2), // Attrchan
2652 ToM0.getValue(1) // In glue
2653 });
2654
2655 SDNode *InterpP1LV =
2656 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2657 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2658 N->getOperand(1), // Src0
2659 N->getOperand(3), // Attr
2660 N->getOperand(2), // Attrchan
2661 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2662 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2663 N->getOperand(4), // high
2664 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2665 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2666 SDValue(InterpMov, 1)
2667 });
2668
2669 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2670}
2671
2672void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2673 unsigned IntrID = N->getConstantOperandVal(1);
2674 switch (IntrID) {
2675 case Intrinsic::amdgcn_ds_append:
2676 case Intrinsic::amdgcn_ds_consume: {
2677 if (N->getValueType(0) != MVT::i32)
2678 break;
2679 SelectDSAppendConsume(N, IntrID);
2680 return;
2681 }
2682 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2683 SelectDSBvhStackIntrinsic(N);
2684 return;
2685 }
2686
2687 SelectCode(N);
2688}
2689
2690void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2691 unsigned IntrID = N->getConstantOperandVal(0);
2692 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2693 SDNode *ConvGlueNode = N->getGluedNode();
2694 if (ConvGlueNode) {
2695 // FIXME: Possibly iterate over multiple glue nodes?
2696 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2697 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2698 ConvGlueNode =
2699 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2700 MVT::Glue, SDValue(ConvGlueNode, 0));
2701 } else {
2702 ConvGlueNode = nullptr;
2703 }
2704 switch (IntrID) {
2705 case Intrinsic::amdgcn_wqm:
2706 Opcode = AMDGPU::WQM;
2707 break;
2708 case Intrinsic::amdgcn_softwqm:
2709 Opcode = AMDGPU::SOFT_WQM;
2710 break;
2711 case Intrinsic::amdgcn_wwm:
2712 case Intrinsic::amdgcn_strict_wwm:
2713 Opcode = AMDGPU::STRICT_WWM;
2714 break;
2715 case Intrinsic::amdgcn_strict_wqm:
2716 Opcode = AMDGPU::STRICT_WQM;
2717 break;
2718 case Intrinsic::amdgcn_interp_p1_f16:
2719 SelectInterpP1F16(N);
2720 return;
2721 case Intrinsic::amdgcn_inverse_ballot:
2722 switch (N->getOperand(1).getValueSizeInBits()) {
2723 case 32:
2724 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2725 break;
2726 case 64:
2727 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2728 break;
2729 default:
2730 llvm_unreachable("Unsupported size for inverse ballot mask.");
2731 }
2732 break;
2733 default:
2734 SelectCode(N);
2735 break;
2736 }
2737
2738 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2739 SDValue Src = N->getOperand(1);
2740 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2741 }
2742
2743 if (ConvGlueNode) {
2744 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
2745 NewOps.push_back(SDValue(ConvGlueNode, 0));
2746 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2747 }
2748}
2749
2750void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2751 unsigned IntrID = N->getConstantOperandVal(1);
2752 switch (IntrID) {
2753 case Intrinsic::amdgcn_ds_gws_init:
2754 case Intrinsic::amdgcn_ds_gws_barrier:
2755 case Intrinsic::amdgcn_ds_gws_sema_v:
2756 case Intrinsic::amdgcn_ds_gws_sema_br:
2757 case Intrinsic::amdgcn_ds_gws_sema_p:
2758 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2759 SelectDS_GWS(N, IntrID);
2760 return;
2761 default:
2762 break;
2763 }
2764
2765 SelectCode(N);
2766}
2767
2768void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2769 SDValue Log2WaveSize =
2770 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2771 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2772 {N->getOperand(0), Log2WaveSize});
2773}
2774
2775void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2776 SDValue SrcVal = N->getOperand(1);
2777 if (SrcVal.getValueType() != MVT::i32) {
2778 SelectCode(N); // Emit default error
2779 return;
2780 }
2781
2782 SDValue CopyVal;
2783 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2784 SDLoc SL(N);
2785
2786 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2787 CopyVal = SrcVal.getOperand(0);
2788 } else {
2789 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2790 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2791
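// The stack pointer is a scalar register, so a divergent source value must
// first be made uniform with a readfirstlane before it can be copied to it.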
2792 if (N->isDivergent()) {
2793 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2794 MVT::i32, SrcVal),
2795 0);
2796 }
2797
2798 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2799 {SrcVal, Log2WaveSize}),
2800 0);
2801 }
2802
2803 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2804 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2805}
2806
2807bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2808 unsigned &Mods,
2809 bool IsCanonicalizing,
2810 bool AllowAbs) const {
2811 Mods = SISrcMods::NONE;
2812 Src = In;
2813
2814 if (Src.getOpcode() == ISD::FNEG) {
2815 Mods |= SISrcMods::NEG;
2816 Src = Src.getOperand(0);
2817 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2818 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2819 // denormal mode, but we're implicitly canonicalizing in a source operand.
2820 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2821 if (LHS && LHS->isZero()) {
2822 Mods |= SISrcMods::NEG;
2823 Src = Src.getOperand(1);
2824 }
2825 }
2826
2827 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2828 Mods |= SISrcMods::ABS;
2829 Src = Src.getOperand(0);
2830 }
2831
2832 return true;
2833}
2834
2835bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2836 SDValue &SrcMods) const {
2837 unsigned Mods;
2838 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2839 /*AllowAbs=*/true)) {
2840 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2841 return true;
2842 }
2843
2844 return false;
2845}
2846
2847bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2848 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2849 unsigned Mods;
2850 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2851 /*AllowAbs=*/true)) {
2852 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2853 return true;
2854 }
2855
2856 return false;
2857}
2858
2859bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2860 SDValue &SrcMods) const {
2861 unsigned Mods;
2862 if (SelectVOP3ModsImpl(In, Src, Mods,
2863 /*IsCanonicalizing=*/true,
2864 /*AllowAbs=*/false)) {
2865 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2866 return true;
2867 }
2868
2869 return false;
2870}
2871
2872bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2873 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2874 return false;
2875
2876 Src = In;
2877 return true;
2878}
2879
2880bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2881 SDValue &SrcMods,
2882 bool OpSel) const {
2883 unsigned Mods;
2884 if (SelectVOP3ModsImpl(In, Src, Mods,
2885 /*IsCanonicalizing=*/true,
2886 /*AllowAbs=*/false)) {
2887 if (OpSel)
2888 Mods |= SISrcMods::OP_SEL_0;
2889 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2890 return true;
2891 }
2892
2893 return false;
2894}
2895
2896bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2897 SDValue &SrcMods) const {
2898 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2899}
2900
2901bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2902 SDValue &SrcMods) const {
2903 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2904}
2905
2906bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2907 SDValue &SrcMods, SDValue &Clamp,
2908 SDValue &Omod) const {
2909 SDLoc DL(In);
2910 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2911 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2912
2913 return SelectVOP3Mods(In, Src, SrcMods);
2914}
2915
2916bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2917 SDValue &SrcMods, SDValue &Clamp,
2918 SDValue &Omod) const {
2919 SDLoc DL(In);
2920 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2921 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2922
2923 return SelectVOP3BMods(In, Src, SrcMods);
2924}
2925
2926bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2927 SDValue &Clamp, SDValue &Omod) const {
2928 Src = In;
2929
2930 SDLoc DL(In);
2931 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2932 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2933
2934 return true;
2935}
2936
2937bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2938 SDValue &SrcMods, bool IsDOT) const {
2939 unsigned Mods = SISrcMods::NONE;
2940 Src = In;
2941
2942 // TODO: Handle G_FSUB 0 as fneg
2943 if (Src.getOpcode() == ISD::FNEG) {
2944 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2945 Src = Src.getOperand(0);
2946 }
2947
2948 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2949 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2950 unsigned VecMods = Mods;
2951
2952 SDValue Lo = stripBitcast(Src.getOperand(0));
2953 SDValue Hi = stripBitcast(Src.getOperand(1));
2954
2955 if (Lo.getOpcode() == ISD::FNEG) {
2956 Lo = stripBitcast(Lo.getOperand(0));
2957 Mods ^= SISrcMods::NEG;
2958 }
2959
2960 if (Hi.getOpcode() == ISD::FNEG) {
2961 Hi = stripBitcast(Hi.getOperand(0));
2962 Mods ^= SISrcMods::NEG_HI;
2963 }
2964
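// If a half is an extract of the high 16 bits of a register, refer to the
// whole register and use op_sel to select the high half.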
2965 if (isExtractHiElt(Lo, Lo))
2966 Mods |= SISrcMods::OP_SEL_0;
2967
2968 if (isExtractHiElt(Hi, Hi))
2969 Mods |= SISrcMods::OP_SEL_1;
2970
2971 unsigned VecSize = Src.getValueSizeInBits();
2972 Lo = stripExtractLoElt(Lo);
2973 Hi = stripExtractLoElt(Hi);
2974
2975 if (Lo.getValueSizeInBits() > VecSize) {
2976 Lo = CurDAG->getTargetExtractSubreg(
2977 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2978 MVT::getIntegerVT(VecSize), Lo);
2979 }
2980
2981 if (Hi.getValueSizeInBits() > VecSize) {
2982 Hi = CurDAG->getTargetExtractSubreg(
2983 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2984 MVT::getIntegerVT(VecSize), Hi);
2985 }
2986
2987 assert(Lo.getValueSizeInBits() <= VecSize &&
2988 Hi.getValueSizeInBits() <= VecSize);
2989
2990 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2991 // Really a scalar input. Just select from the low half of the register to
2992 // avoid packing.
2993
2994 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2995 Src = Lo;
2996 } else {
2997 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2998
2999 SDLoc SL(In);
3000 SDValue Undef = SDValue(
3001 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3002 Lo.getValueType()), 0);
3003 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3004 : AMDGPU::SReg_64RegClassID;
3005 const SDValue Ops[] = {
3006 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3007 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3008 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3009
3010 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3011 Src.getValueType(), Ops), 0);
3012 }
3013 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3014 return true;
3015 }
3016
3017 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3018 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3019 .bitcastToAPInt().getZExtValue();
3020 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3021 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3022 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3023 return true;
3024 }
3025 }
3026
3027 Mods = VecMods;
3028 }
3029
3030 // Packed instructions do not have abs modifiers.
3031 Mods |= SISrcMods::OP_SEL_1;
3032
3033 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3034 return true;
3035}
3036
3037bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3038 SDValue &SrcMods) const {
3039 return SelectVOP3PMods(In, Src, SrcMods, true);
3040}
3041
3042bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3043 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3044 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3045 // 1 promotes packed values to signed, 0 treats them as unsigned.
3046 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3047
3048 unsigned Mods = SISrcMods::OP_SEL_1;
3049 unsigned SrcSign = C->getZExtValue();
3050 if (SrcSign == 1)
3051 Mods ^= SISrcMods::NEG;
3052
3053 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3054 return true;
3055}
3056
3057bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3058 SDValue &Src) const {
3059 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3060 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3061
3062 unsigned Mods = SISrcMods::OP_SEL_1;
3063 unsigned SrcVal = C->getZExtValue();
3064 if (SrcVal == 1)
3065 Mods |= SISrcMods::OP_SEL_0;
3066
3067 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3068 return true;
3069}
3070
3071 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3072 llvm::SelectionDAG *CurDAG,
3073 const SDLoc &DL) {
3074 unsigned DstRegClass;
3075 EVT DstTy;
3076 switch (Elts.size()) {
3077 case 8:
3078 DstRegClass = AMDGPU::VReg_256RegClassID;
3079 DstTy = MVT::v8i32;
3080 break;
3081 case 4:
3082 DstRegClass = AMDGPU::VReg_128RegClassID;
3083 DstTy = MVT::v4i32;
3084 break;
3085 case 2:
3086 DstRegClass = AMDGPU::VReg_64RegClassID;
3087 DstTy = MVT::v2i32;
3088 break;
3089 default:
3090 llvm_unreachable("unhandled Reg sequence size");
3091 }
3092
3093 SmallVector<SDValue, 8 + 1> Ops;
3094 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3095 for (unsigned i = 0; i < Elts.size(); ++i) {
3096 Ops.push_back(Elts[i]);
3097 Ops.push_back(CurDAG->getTargetConstant(
3098 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3099 }
3100 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3101}
3102
3103 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3104 llvm::SelectionDAG *CurDAG,
3105 const SDLoc &DL) {
3106 SmallVector<SDValue, 8> PackedElts;
3107 assert("unhandled Reg sequence size" &&
3108 (Elts.size() == 8 || Elts.size() == 16));
3109
3110 // Pack 16-bit elements in pairs into a 32-bit register. If both elements are
3111 // unpacked from the same 32-bit source, use it; otherwise pack them using v_perm.
3112 for (unsigned i = 0; i < Elts.size(); i += 2) {
3113 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3114 SDValue HiSrc;
3115 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3116 PackedElts.push_back(HiSrc);
3117 } else {
3118 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3119 MachineSDNode *Packed =
3120 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3121 {Elts[i + 1], Elts[i], PackLoLo});
3122 PackedElts.push_back(SDValue(Packed, 0));
3123 }
3124 }
3125
3126 return buildRegSequence32(PackedElts, CurDAG, DL);
3127}
3128
3129 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3130 llvm::SelectionDAG *CurDAG,
3131 const SDLoc &DL, unsigned ElementSize) {
3132 if (ElementSize == 16)
3133 return buildRegSequence16(Elts, CurDAG, DL);
3134 if (ElementSize == 32)
3135 return buildRegSequence32(Elts, CurDAG, DL);
3136 llvm_unreachable("Unhandled element size");
3137}
3138
3139 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3140 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3141 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3142 unsigned ElementSize) {
3143 if (ModOpcode == ISD::FNEG) {
3144 Mods |= SISrcMods::NEG;
3145 // Check if all elements also have abs modifier
3146 SmallVector<SDValue, 8> NegAbsElts;
3147 for (auto El : Elts) {
3148 if (El.getOpcode() != ISD::FABS)
3149 break;
3150 NegAbsElts.push_back(El->getOperand(0));
3151 }
3152 if (Elts.size() != NegAbsElts.size()) {
3153 // Neg
3154 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3155 } else {
3156 // Neg and Abs
3157 Mods |= SISrcMods::NEG_HI;
3158 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3159 }
3160 } else {
3161 assert(ModOpcode == ISD::FABS);
3162 // Abs
3163 Mods |= SISrcMods::NEG_HI;
3164 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3165 }
3166}
3167
3168 // Check all f16 elements for modifiers while looking through b32 and v2b16
3169 // build vectors; stop if an element does not satisfy ModifierCheck.
3170 static void
3171 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3172 std::function<bool(SDValue)> ModifierCheck) {
3173 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3174 if (auto *F16Pair =
3175 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3176 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3177 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3178 if (!ModifierCheck(ElF16))
3179 break;
3180 }
3181 }
3182 }
3183}
3184
3185bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3186 SDValue &SrcMods) const {
3187 Src = In;
3188 unsigned Mods = SISrcMods::OP_SEL_1;
3189
3190 // mods are on f16 elements
3191 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3192 SmallVector<SDValue, 8> EltsF16;
3193
3194 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3195 if (Element.getOpcode() != ISD::FNEG)
3196 return false;
3197 EltsF16.push_back(Element.getOperand(0));
3198 return true;
3199 });
3200
3201 // All elements have neg modifier
3202 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3203 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3204 Mods |= SISrcMods::NEG;
3205 Mods |= SISrcMods::NEG_HI;
3206 }
3207 }
3208
3209 // mods are on v2f16 elements
3210 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3211 SmallVector<SDValue, 8> EltsV2F16;
3212 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3213 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3214 // Based on first element decide which mod we match, neg or abs
3215 if (ElV2f16.getOpcode() != ISD::FNEG)
3216 break;
3217 EltsV2F16.push_back(ElV2f16.getOperand(0));
3218 }
3219
3220 // All pairs of elements have neg modifier
3221 if (BV->getNumOperands() == EltsV2F16.size()) {
3222 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3223 Mods |= SISrcMods::NEG;
3224 Mods |= SISrcMods::NEG_HI;
3225 }
3226 }
3227
3228 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3229 return true;
3230}
3231
3232bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3233 SDValue &SrcMods) const {
3234 Src = In;
3235 unsigned Mods = SISrcMods::OP_SEL_1;
3236 unsigned ModOpcode;
3237
3238 // mods are on f16 elements
3239 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3240 SmallVector<SDValue, 8> EltsF16;
3241 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3242 // Based on first element decide which mod we match, neg or abs
3243 if (EltsF16.empty())
3244 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3245 if (ElF16.getOpcode() != ModOpcode)
3246 return false;
3247 EltsF16.push_back(ElF16.getOperand(0));
3248 return true;
3249 });
3250
3251 // All elements have ModOpcode modifier
3252 if (BV->getNumOperands() * 2 == EltsF16.size())
3253 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3254 16);
3255 }
3256
3257 // mods are on v2f16 elements
3258 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3259 SmallVector<SDValue, 8> EltsV2F16;
3260
3261 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3262 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3263 // Based on first element decide which mod we match, neg or abs
3264 if (EltsV2F16.empty())
3265 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3266 if (ElV2f16->getOpcode() != ModOpcode)
3267 break;
3268 EltsV2F16.push_back(ElV2f16->getOperand(0));
3269 }
3270
3271 // All elements have ModOpcode modifier
3272 if (BV->getNumOperands() == EltsV2F16.size())
3273 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3274 32);
3275 }
3276
3277 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3278 return true;
3279}
3280
3281bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3282 SDValue &SrcMods) const {
3283 Src = In;
3284 unsigned Mods = SISrcMods::OP_SEL_1;
3285 SmallVector<SDValue, 8> EltsF32;
3286
3287 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3288 assert(BV->getNumOperands() > 0);
3289 // Based on first element decide which mod we match, neg or abs
3290 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3291 unsigned ModOpcode =
3292 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3293 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3294 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3295 if (ElF32.getOpcode() != ModOpcode)
3296 break;
3297 EltsF32.push_back(ElF32.getOperand(0));
3298 }
3299
3300 // All elements had ModOpcode modifier
3301 if (BV->getNumOperands() == EltsF32.size())
3302 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3303 32);
3304 }
3305
3306 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3307 return true;
3308}
3309
3310bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3311 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3312 BitVector UndefElements;
3313 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3314 if (isInlineImmediate(Splat.getNode())) {
3315 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3316 unsigned Imm = C->getAPIntValue().getSExtValue();
3317 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3318 return true;
3319 }
3320 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3321 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3322 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3323 return true;
3324 }
3325 llvm_unreachable("unhandled Constant node");
3326 }
3327 }
3328
3329 // 16 bit splat
3330 SDValue SplatSrc32 = stripBitcast(In);
3331 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3332 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3333 SDValue SplatSrc16 = stripBitcast(Splat32);
3334 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3335 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3336 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3337 std::optional<APInt> RawValue;
3338 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3339 RawValue = C->getValueAPF().bitcastToAPInt();
3340 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3341 RawValue = C->getAPIntValue();
3342
3343 if (RawValue.has_value()) {
3344 EVT VT = In.getValueType().getScalarType();
3345 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3346 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3347 ? APFloat::IEEEhalf()
3348 : APFloat::BFloat(),
3349 RawValue.value());
3350 if (TII->isInlineConstant(FloatVal)) {
3351 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3352 MVT::i16);
3353 return true;
3354 }
3355 } else if (VT.getSimpleVT() == MVT::i16) {
3356 if (TII->isInlineConstant(RawValue.value())) {
3357 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3358 MVT::i16);
3359 return true;
3360 }
3361 } else
3362 llvm_unreachable("unknown 16-bit type");
3363 }
3364 }
3365 }
3366
3367 return false;
3368}
3369
3370bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3371 SDValue &IndexKey) const {
3372 unsigned Key = 0;
3373 Src = In;
3374
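// If the index is produced by a byte-aligned right shift of a 32-bit value,
// fold the shift into the index_key operand and use the unshifted source.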
3375 if (In.getOpcode() == ISD::SRL) {
3376 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3377 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3378 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3379 ShiftAmt->getZExtValue() % 8 == 0) {
3380 Key = ShiftAmt->getZExtValue() / 8;
3381 Src = ShiftSrc;
3382 }
3383 }
3384
3385 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3386 return true;
3387}
3388
3389bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3390 SDValue &IndexKey) const {
3391 unsigned Key = 0;
3392 Src = In;
3393
3394 if (In.getOpcode() == ISD::SRL) {
3395 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3396 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3397 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3398 ShiftAmt->getZExtValue() == 16) {
3399 Key = 1;
3400 Src = ShiftSrc;
3401 }
3402 }
3403
3404 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3405 return true;
3406}
3407
3408bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3409 SDValue &SrcMods) const {
3410 Src = In;
3411 // FIXME: Handle op_sel
3412 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3413 return true;
3414}
3415
3416bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3417 SDValue &SrcMods) const {
3418 // FIXME: Handle op_sel
3419 return SelectVOP3Mods(In, Src, SrcMods);
3420}
3421
3422 // The return value is not whether the match is possible (which it always is),
3423 // but whether or not a conversion is really used.
3424bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3425 unsigned &Mods) const {
3426 Mods = 0;
3427 SelectVOP3ModsImpl(In, Src, Mods);
3428
3429 if (Src.getOpcode() == ISD::FP_EXTEND) {
3430 Src = Src.getOperand(0);
3431 assert(Src.getValueType() == MVT::f16);
3432 Src = stripBitcast(Src);
3433
3434 // Be careful about folding modifiers if we already have an abs. fneg is
3435 // applied last, so we don't want to apply an earlier fneg.
3436 if ((Mods & SISrcMods::ABS) == 0) {
3437 unsigned ModsTmp;
3438 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3439
3440 if ((ModsTmp & SISrcMods::NEG) != 0)
3441 Mods ^= SISrcMods::NEG;
3442
3443 if ((ModsTmp & SISrcMods::ABS) != 0)
3444 Mods |= SISrcMods::ABS;
3445 }
3446
3447 // op_sel/op_sel_hi decide the source type and source.
3448 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3449 // If the source's op_sel is set, it picks the high half of the source
3450 // register.
3451
3452 Mods |= SISrcMods::OP_SEL_1;
3453 if (isExtractHiElt(Src, Src)) {
3454 Mods |= SISrcMods::OP_SEL_0;
3455
3456 // TODO: Should we try to look for neg/abs here?
3457 }
3458
3459 return true;
3460 }
3461
3462 return false;
3463}
3464
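// Variant that succeeds only when a mixed-precision conversion is actually
// folded; it fails when there is no fp16-to-fp32 fp_extend to strip, so other
// patterns can match instead.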
3465bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3466 SDValue &SrcMods) const {
3467 unsigned Mods = 0;
3468 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3469 return false;
3470 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3471 return true;
3472}
3473
3474bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3475 SDValue &SrcMods) const {
3476 unsigned Mods = 0;
3477 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3478 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3479 return true;
3480}
3481
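// Produce a value suitable for the high 16 bits of a packed 32-bit operand:
// undef stays undef, integer and FP constants are rebuilt with their bits
// shifted into the high half, and an existing extract of a high half is
// reused directly. Returns a null SDValue when no such form is found.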
3482SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3483 if (In.isUndef())
3484 return CurDAG->getUNDEF(MVT::i32);
3485
3486 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3487 SDLoc SL(In);
3488 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3489 }
3490
3491 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3492 SDLoc SL(In);
3493 return CurDAG->getConstant(
3494 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3495 }
3496
3497 SDValue Src;
3498 if (isExtractHiElt(In, Src))
3499 return Src;
3500
3501 return SDValue();
3502}
3503
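// Decide whether an immediate operand should be materialized in a VGPR: walk
// up to ten uses of N and return true only if at least one use cannot accept
// the value in an SGPR operand, even after trying to commute a commutable
// machine user, and the ten-use limit was not reached.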
3504bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3505 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3506
3507 const SIRegisterInfo *SIRI =
3508 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3509 const SIInstrInfo * SII =
3510 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3511
3512 unsigned Limit = 0;
3513 bool AllUsesAcceptSReg = true;
3514 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3515 Limit < 10 && U != E; ++U, ++Limit) {
3516 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3517
3518 // If the register class is unknown, it could be an unknown
3519 // register class that needs to be an SGPR, e.g. an inline asm
3520 // constraint
3521 if (!RC || SIRI->isSGPRClass(RC))
3522 return false;
3523
3524 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3525 AllUsesAcceptSReg = false;
3526 SDNode * User = *U;
3527 if (User->isMachineOpcode()) {
3528 unsigned Opc = User->getMachineOpcode();
3529 const MCInstrDesc &Desc = SII->get(Opc);
3530 if (Desc.isCommutable()) {
3531 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3532 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3533 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3534 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3535 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3536 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3537 CommutedRC == &AMDGPU::VS_64RegClass)
3538 AllUsesAcceptSReg = true;
3539 }
3540 }
3541 }
3542 // If "AllUsesAcceptSReg == false" at this point, we have not succeeded in
3543 // commuting the current user, which means at least one use strictly
3544 // requires a VGPR. Thus, we will not attempt to commute the remaining
3545 // user instructions.
3546 if (!AllUsesAcceptSReg)
3547 break;
3548 }
3549 }
3550 return !AllUsesAcceptSReg && (Limit < 10);
3551}
3552
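// Check whether a load may be treated as uniform: the node must either be
// non-divergent or have a memory operand known to be uniform, have a known
// size with alignment of at least min(size, 4) bytes, and read from the
// constant address space, or be a simple global-address-space load with no
// clobbering stores when scalarized global loads are enabled.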
3553bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3554 auto Ld = cast<LoadSDNode>(N);
3555
3556 const MachineMemOperand *MMO = Ld->getMemOperand();
3557 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3558 return false;
3559
3560 return MMO->getSize().hasValue() &&
3561 Ld->getAlign() >=
3562 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3563 uint64_t(4))) &&
3564 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3565 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3566 (Subtarget->getScalarizeGlobalBehavior() &&
3567 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3568 Ld->isSimple() &&
3569 static_cast<const SITargetLowering *>(getTargetLowering())
3570 ->isMemOpHasNoClobberedMemOperand(N)));
3571}
3572
3573void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3574 const AMDGPUTargetLowering& Lowering =
3575 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3576 bool IsModified = false;
3577 do {
3578 IsModified = false;
3579
3580 // Go over all selected nodes and try to fold them a bit more
3581 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3582 while (Position != CurDAG->allnodes_end()) {
3583 SDNode *Node = &*Position++;
3584 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3585 if (!MachineNode)
3586 continue;
3587
3588 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3589 if (ResNode != Node) {
3590 if (ResNode)
3591 ReplaceUses(Node, ResNode);
3592 IsModified = true;
3593 }
3594 }
3595 CurDAG->RemoveDeadNodes();
3596 } while (IsModified);
3597}
3598
3599char AMDGPUDAGToDAGISel::ID = 0;