//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
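// Matched inputs include, for example:
//   (i16 (extract_vector_elt (v2i16 x), 1))
//   (i16 (trunc (srl (i32 x), 16)))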
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16 bits of the
// same register.
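// For example, both (i16 (extract_vector_elt (v2i16 x), 0)) and
// (i16 (trunc (i32 x))) reduce to x (modulo bitcasts).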
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = getHi16Elt(Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

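// A note on terminology: an "inline immediate" is an operand value the
// hardware encodes directly in the instruction at no extra cost. As a rough
// sketch of the usual AMDGPU rules, this covers small integers in [-16, 64]
// and a few FP constants (0.0, +-0.5, +-1.0, +-2.0, +-4.0, and 1/(2*pi) on
// newer subtargets); TII->isInlineConstant below is the authoritative,
// per-subtarget test.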
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

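// Note on the M0 initialization below: local (LDS) accesses on subtargets
// that require M0 to be initialized use -1 (no bound), while region (GDS)
// accesses place the GDS size in M0.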
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
      glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

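// Materialize a 64-bit scalar immediate as two S_MOV_B32 joined by a
// REG_SEQUENCE; e.g. Imm = 0x100000002 becomes
//   s_mov_b32 lo, 2
//   s_mov_b32 hi, 1
//   REG_SEQUENCE lo, sub0, hi, sub1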
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
      Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
      Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
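    // For reference, the packed S_BFE_{I,U}32 form puts the offset in the
    // low bits of the second source operand and the width in bits [22:16],
    // i.e. src1 = offset | (width << 16).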
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

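// A shift reads only log2(bitwidth) bits of its amount operand, so a mask
// that preserves at least ShAmtBits low bits changes nothing; e.g. for a
// 32-bit shift, (and x, 31) can be dropped from the shift amount.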
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's a complicated pattern to match,
    // i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(GCNTargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
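// The 64-bit add/sub below is split into two 32-bit halves linked through
// the carry; e.g. on the SALU path:
//   s_add_u32  lo, lhs.lo, rhs.lo
//   s_addc_u32 hi, lhs.hi, rhs.hi
// and the halves are recombined into a 64-bit value with a REG_SEQUENCE.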
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

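// DS instructions encode an unsigned 16-bit byte offset: e.g. 65535 can be
// folded into the instruction, while 65536 must stay in the address
// computation.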
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands instruction with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

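// read2/write2 encode two 8-bit offsets in units of the element size: with
// Size == 4, each offset must be a multiple of 4 and at most 255 * 4 = 1020
// bytes from the base.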
bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands instruction with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of flat scratch load/store in the form of `base
// + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
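  // (For example, a base of -0x1000 with an offset of -0x2000 would sum to
  // -0x3000, which cannot be a valid scratch address, so a small negative
  // offset lets us assume a non-negative base.)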
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address value in SGPR/VGPR is legal for flat scratch in the
// form of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address value in SGPR/VGPR is legal for flat scratch in the
// form of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instructions.
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  auto RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.
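        // (For illustration, assuming a 12-bit unsigned immediate field: an
        // offset of 0x2005 could split into OffsetVal = 0x5 and
        // RemainderOffset = 0x2000, with the remainder folded into the base
        // address below.)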

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}
1762
1763// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1764bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1765 SDValue Addr,
1766 SDValue &SAddr,
1767 SDValue &VOffset,
1768 SDValue &Offset) const {
1769 int64_t ImmOffset = 0;
1770
1771 // Match the immediate offset first, which canonically is moved as low as
1772 // possible.
1773
1774 SDValue LHS, RHS;
1775 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1776 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1777 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1778
1779 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1780 SIInstrFlags::FlatGlobal)) {
1781 Addr = LHS;
1782 ImmOffset = COffsetVal;
1783 } else if (!LHS->isDivergent()) {
1784 if (COffsetVal > 0) {
1785 SDLoc SL(N);
1786 // saddr + large_offset -> saddr +
1787 // (voffset = large_offset & ~MaxOffset) +
1788 // (large_offset & MaxOffset);
1789 int64_t SplitImmOffset, RemainderOffset;
1790 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1791 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1792
1793 if (isUInt<32>(RemainderOffset)) {
1794 SDNode *VMov = CurDAG->getMachineNode(
1795 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1796 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1797 VOffset = SDValue(VMov, 0);
1798 SAddr = LHS;
1799 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1800 return true;
1801 }
1802 }
1803
1804 // We are adding a 64 bit SGPR and a constant. If the constant bus limit
1805 // is 1, we would need to perform 1 or 2 extra moves for each half of
1806 // the constant, and it is better to do a scalar add and then issue a
1807 // single VALU instruction to materialize zero. Otherwise it takes fewer
1808 // instructions to perform VALU adds with immediates or inline literals.
1809 unsigned NumLiterals =
1810 !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1811 !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1812 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1813 return false;
1814 }
1815 }
1816
1817 // Match the variable offset.
1818 if (Addr.getOpcode() == ISD::ADD) {
1819 LHS = Addr.getOperand(0);
1820 RHS = Addr.getOperand(1);
1821
1822 if (!LHS->isDivergent()) {
1823 // add (i64 sgpr), (zero_extend (i32 vgpr))
1824 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1825 SAddr = LHS;
1826 VOffset = ZextRHS;
1827 }
1828 }
1829
1830 if (!SAddr && !RHS->isDivergent()) {
1831 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1832 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1833 SAddr = RHS;
1834 VOffset = ZextLHS;
1835 }
1836 }
1837
1838 if (SAddr) {
1839 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1840 return true;
1841 }
1842 }
1843
1844 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1845 isa<ConstantSDNode>(Addr))
1846 return false;
1847
1848 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1849 // moves required to copy a 64-bit SGPR to VGPR.
1850 SAddr = Addr;
1851 SDNode *VMov =
1852 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1853 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1854 VOffset = SDValue(VMov, 0);
1855 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1856 return true;
1857}
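// Shapes accepted above, in pseudo-DAG form (illustrative summary, not code
// from this file):
//   (add (add (i64 sgpr), (zero_extend (i32 vgpr))), imm)
//       -> SAddr = sgpr, VOffset = vgpr, Offset = imm
//   uniform 64-bit address with no suitable VGPR part
//       -> SAddr = address, VOffset = V_MOV_B32 0, Offset = imm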
1858
1859static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1860 if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1861 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1862 } else if (SAddr.getOpcode() == ISD::ADD &&
1863 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1864 // Materialize this into a scalar move for scalar address to avoid
1865 // readfirstlane.
1866 auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1867 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1868 FI->getValueType(0));
1869 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1870 MVT::i32, TFI, SAddr.getOperand(1)),
1871 0);
1872 }
1873
1874 return SAddr;
1875}
1876
1877// Match (32-bit SGPR base) + sext(imm offset)
1878bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1879 SDValue &SAddr,
1880 SDValue &Offset) const {
1881 if (Addr->isDivergent())
1882 return false;
1883
1884 SDLoc DL(Addr);
1885
1886 int64_t COffsetVal = 0;
1887
1888 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1889 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1890 SAddr = Addr.getOperand(0);
1891 } else {
1892 SAddr = Addr;
1893 }
1894
1895 SAddr = SelectSAddrFI(CurDAG, SAddr);
1896
1897 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1898
1899 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1900 SIInstrFlags::FlatScratch)) {
1901 int64_t SplitImmOffset, RemainderOffset;
1902 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1903 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1904
1905 COffsetVal = SplitImmOffset;
1906
1907 SDValue AddOffset =
1908 SAddr.getOpcode() == ISD::TargetFrameIndex
1909 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1910 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1911 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1912 SAddr, AddOffset),
1913 0);
1914 }
1915
1916 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1917
1918 return true;
1919}
1920
1921// Check whether the flat scratch SVS swizzle bug affects this access.
1922bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1923 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1924 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1925 return false;
1926
1927 // The bug affects the swizzling of SVS accesses if there is any carry out
1928 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1929 // voffset to (soffset + inst_offset).
1930 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1931 KnownBits SKnown = KnownBits::computeForAddSub(
1932 /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
1933 CurDAG->computeKnownBits(SAddr),
1934 KnownBits::makeConstant(APInt(32, ImmOffset)));
1935 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1936 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1937 return (VMax & 3) + (SMax & 3) >= 4;
1938}
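// Worked example of the carry test above (illustrative bounds): if the low
// two bits of voffset can reach 3 and those of (soffset + inst_offset) can
// reach 1, then (3 & 3) + (1 & 3) == 4, so a carry out of bit 1 is possible
// and the function conservatively reports that the bug may apply; with
// maxima of 2 and 1 the sum is 3 and the access is provably unaffected.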
1939
1940bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1941 SDValue &VAddr, SDValue &SAddr,
1942 SDValue &Offset) const {
1943 int64_t ImmOffset = 0;
1944
1945 SDValue LHS, RHS;
1946 SDValue OrigAddr = Addr;
1947 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1948 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1949 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1950
1951 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1952 Addr = LHS;
1953 ImmOffset = COffsetVal;
1954 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1955 SDLoc SL(N);
1956 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1957 // (large_offset & MaxOffset);
1958 int64_t SplitImmOffset, RemainderOffset;
1959 std::tie(SplitImmOffset, RemainderOffset)
1960 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1961
1962 if (isUInt<32>(RemainderOffset)) {
1963 SDNode *VMov = CurDAG->getMachineNode(
1964 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1965 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1966 VAddr = SDValue(VMov, 0);
1967 SAddr = LHS;
1968 if (!isFlatScratchBaseLegal(Addr))
1969 return false;
1970 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1971 return false;
1972 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1973 return true;
1974 }
1975 }
1976 }
1977
1978 if (Addr.getOpcode() != ISD::ADD)
1979 return false;
1980
1981 LHS = Addr.getOperand(0);
1982 RHS = Addr.getOperand(1);
1983
1984 if (!LHS->isDivergent() && RHS->isDivergent()) {
1985 SAddr = LHS;
1986 VAddr = RHS;
1987 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1988 SAddr = RHS;
1989 VAddr = LHS;
1990 } else {
1991 return false;
1992 }
1993
1994 if (OrigAddr != Addr) {
1995 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1996 return false;
1997 } else {
1998 if (!isFlatScratchBaseLegalSV(OrigAddr))
1999 return false;
2000 }
2001
2002 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2003 return false;
2004 SAddr = SelectSAddrFI(CurDAG, SAddr);
2005 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
2006 return true;
2007}
2008
2009// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2010// not null) offset. If Imm32Only is true, match only 32-bit immediate
2011// offsets available on CI.
2012bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2013 SDValue *SOffset, SDValue *Offset,
2014 bool Imm32Only, bool IsBuffer) const {
2015 assert((!SOffset || !Offset) &&
2016 "Cannot match both soffset and offset at the same time!");
2017
2018 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2019 if (!C) {
2020 if (!SOffset)
2021 return false;
2022 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2023 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2024 *SOffset = ByteOffsetNode;
2025 return true;
2026 }
2027 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2028 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2029 *SOffset = ByteOffsetNode.getOperand(0);
2030 return true;
2031 }
2032 }
2033 return false;
2034 }
2035
2036 SDLoc SL(ByteOffsetNode);
2037
2038 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2039 // offset for S_BUFFER instructions is unsigned.
2040 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2041 std::optional<int64_t> EncodedOffset =
2042 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
2043 if (EncodedOffset && Offset && !Imm32Only) {
2044 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2045 return true;
2046 }
2047
2048 // SGPR and literal offsets are unsigned.
2049 if (ByteOffset < 0)
2050 return false;
2051
2052 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2053 if (EncodedOffset && Offset && Imm32Only) {
2054 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2055 return true;
2056 }
2057
2058 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2059 return false;
2060
2061 if (SOffset) {
2062 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2063 *SOffset = SDValue(
2064 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2065 return true;
2066 }
2067
2068 return false;
2069}
2070
2071SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2072 if (Addr.getValueType() != MVT::i32)
2073 return Addr;
2074
2075 // Zero-extend a 32-bit address.
2076 SDLoc SL(Addr);
2077
2078 const MachineFunction &MF = CurDAG->getMachineFunction();
2079 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2080 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2081 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2082
2083 const SDValue Ops[] = {
2084 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2085 Addr,
2086 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2087 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2088 0),
2089 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2090 };
2091
2092 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2093 Ops), 0);
2094}
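// The REG_SEQUENCE built above effectively produces (illustrative):
//   result.sub0 = Addr                  ; original 32-bit address
//   result.sub1 = S_MOV_B32 AddrHiVal   ; constant high bits
// i.e. a 64-bit SGPR pair whose high half comes from
// get32BitAddressHighBits().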
2095
2096// Match a base and an immediate (if Offset is not null) or an SGPR (if
2097// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2098// true, match only 32-bit immediate offsets available on CI.
2099bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2100 SDValue *SOffset, SDValue *Offset,
2101 bool Imm32Only,
2102 bool IsBuffer) const {
2103 if (SOffset && Offset) {
2104 assert(!Imm32Only && !IsBuffer);
2105 SDValue B;
2106 return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
2107 SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
2108 }
2109
2110 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2111 // wraparound, because s_load instructions perform the addition in 64 bits.
2112 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2113 !Addr->getFlags().hasNoUnsignedWrap())
2114 return false;
2115
2116 SDValue N0, N1;
2117 // Extract the base and offset if possible.
2118 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2119 N0 = Addr.getOperand(0);
2120 N1 = Addr.getOperand(1);
2121 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2122 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2123 }
2124 if (!N0 || !N1)
2125 return false;
2126 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2127 SBase = N0;
2128 return true;
2129 }
2130 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2131 SBase = N1;
2132 return true;
2133 }
2134 return false;
2135}
2136
2137bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2138 SDValue *SOffset, SDValue *Offset,
2139 bool Imm32Only) const {
2140 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2141 SBase = Expand32BitAddress(SBase);
2142 return true;
2143 }
2144
2145 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2146 SBase = Expand32BitAddress(Addr);
2147 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2148 return true;
2149 }
2150
2151 return false;
2152}
2153
2154bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2155 SDValue &Offset) const {
2156 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2157}
2158
2159bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2160 SDValue &Offset) const {
2161 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2162 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2163 /* Imm32Only */ true);
2164}
2165
2166bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2167 SDValue &SOffset) const {
2168 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2169}
2170
2171bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2172 SDValue &SOffset,
2173 SDValue &Offset) const {
2174 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2175}
2176
2177bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2178 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2179 /* Imm32Only */ false, /* IsBuffer */ true);
2180}
2181
2182bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2183 SDValue &Offset) const {
2184 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2185 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2186 /* Imm32Only */ true, /* IsBuffer */ true);
2187}
2188
2189bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2190 SDValue &Offset) const {
2191 // Match the (soffset + offset) pair as a 32-bit register base and
2192 // an immediate offset.
2193 return N.getValueType() == MVT::i32 &&
2194 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2195 &Offset, /* Imm32Only */ false,
2196 /* IsBuffer */ true);
2197}
2198
2199bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2200 SDValue &Base,
2201 SDValue &Offset) const {
2202 SDLoc DL(Index);
2203
2204 if (CurDAG->isBaseWithConstantOffset(Index)) {
2205 SDValue N0 = Index.getOperand(0);
2206 SDValue N1 = Index.getOperand(1);
2207 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2208
2209 // (add n0, c0)
2210 // Don't peel off the offset (c0) if doing so could possibly lead
2211 // the base (n0) to be negative.
2212 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2213 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2214 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2215 Base = N0;
2216 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2217 return true;
2218 }
2219 }
2220
2221 if (isa<ConstantSDNode>(Index))
2222 return false;
2223
2224 Base = Index;
2225 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2226 return true;
2227}
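// Example of the peel-off rule above (illustrative): for (add n0, 16) with
// n0 known non-negative, Base = n0 and Offset = 16. If n0 could be negative,
// peeling off a positive c0 might leave a negative base even though the sum
// itself was in range, so the whole index is used with a zero offset instead.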
2228
2229SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2230 SDValue Val, uint32_t Offset,
2231 uint32_t Width) {
2232 if (Val->isDivergent()) {
2233 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2234 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2235 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2236
2237 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2238 }
2239 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2240 // Pack the offset and width of a BFE into the format expected by
2241 // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
2242 // contain the offset and bits [22:16] the width.
2243 uint32_t PackedVal = Offset | (Width << 16);
2244 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2245
2246 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2247}
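// Worked encoding example for the scalar path above (illustrative values):
// Offset = 8 and Width = 5 give PackedVal = 8 | (5 << 16) = 0x50008, so
// "S_BFE_U32 dst, src, 0x50008" extracts bits [12:8] of src.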
2248
2249void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2250 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2251 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2252 // Predicate: 0 < b <= c < 32
2253
2254 const SDValue &Shl = N->getOperand(0);
2255 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2256 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2257
2258 if (B && C) {
2259 uint32_t BVal = B->getZExtValue();
2260 uint32_t CVal = C->getZExtValue();
2261
2262 if (0 < BVal && BVal <= CVal && CVal < 32) {
2263 bool Signed = N->getOpcode() == ISD::SRA;
2264 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2265 32 - CVal));
2266 return;
2267 }
2268 }
2269 SelectCode(N);
2270}
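// Worked example of the rewrite above (illustrative values): with b = 8 and
// c = 20, satisfying 0 < b <= c < 32, "(a << 8) srl 20" keeps bits [23:12]
// of a, so it becomes "BFE_U32 a, 12, 12" (offset = c - b, width = 32 - c).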
2271
2272void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2273 switch (N->getOpcode()) {
2274 case ISD::AND:
2275 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2276 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2277 // Predicate: isMask(mask)
2278 const SDValue &Srl = N->getOperand(0);
2279 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2280 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2281
2282 if (Shift && Mask) {
2283 uint32_t ShiftVal = Shift->getZExtValue();
2284 uint32_t MaskVal = Mask->getZExtValue();
2285
2286 if (isMask_32(MaskVal)) {
2287 uint32_t WidthVal = llvm::popcount(MaskVal);
2288 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2289 WidthVal));
2290 return;
2291 }
2292 }
2293 }
2294 break;
2295 case ISD::SRL:
2296 if (N->getOperand(0).getOpcode() == ISD::AND) {
2297 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2298 // Predicate: isMask(mask >> b)
2299 const SDValue &And = N->getOperand(0);
2300 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2301 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2302
2303 if (Shift && Mask) {
2304 uint32_t ShiftVal = Shift->getZExtValue();
2305 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2306
2307 if (isMask_32(MaskVal)) {
2308 uint32_t WidthVal = llvm::popcount(MaskVal);
2309 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2310 WidthVal));
2311 return;
2312 }
2313 }
2314 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2315 SelectS_BFEFromShifts(N);
2316 return;
2317 }
2318 break;
2319 case ISD::SRA:
2320 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2321 SelectS_BFEFromShifts(N);
2322 return;
2323 }
2324 break;
2325
2326 case ISD::SIGN_EXTEND_INREG: {
2327 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2328 SDValue Src = N->getOperand(0);
2329 if (Src.getOpcode() != ISD::SRL)
2330 break;
2331
2332 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2333 if (!Amt)
2334 break;
2335
2336 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2337 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2338 Amt->getZExtValue(), Width));
2339 return;
2340 }
2341 }
2342
2343 SelectCode(N);
2344}
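// Worked examples for the AND and SRL cases above (illustrative values):
//   (srl a, 4) & 0xff    -> BFE_U32 a, 4, 8   ; popcount(0xff) = 8
//   (and a, 0xff0) srl 4 -> BFE_U32 a, 4, 8   ; 0xff0 >> 4 = 0xff is a mask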
2345
2346bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2347 assert(N->getOpcode() == ISD::BRCOND);
2348 if (!N->hasOneUse())
2349 return false;
2350
2351 SDValue Cond = N->getOperand(1);
2352 if (Cond.getOpcode() == ISD::CopyToReg)
2353 Cond = Cond.getOperand(2);
2354
2355 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2356 return false;
2357
2358 MVT VT = Cond.getOperand(0).getSimpleValueType();
2359 if (VT == MVT::i32)
2360 return true;
2361
2362 if (VT == MVT::i64) {
2363 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2364
2365 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2366 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2367 }
2368
2369 return false;
2370}
2371
2372static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2373 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2374 // Special case for amdgcn.ballot:
2375 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2376 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2377 // =>
2378 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2379 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2380 // Cond becomes a i(WaveSize) full mask value.
2381 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2382 // it here for completeness, so in this case Negate is set true on return.
2383 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2384 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2385 isNullConstant(VCMP.getOperand(1))) {
2386
2387 auto Cond = VCMP.getOperand(0);
2388 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2389 Cond = Cond.getOperand(0);
2390
2391 if (isBoolSGPR(Cond)) {
2392 Negate = VCMP_CC == ISD::SETEQ;
2393 return Cond;
2394 }
2395 }
2396 return SDValue();
2397}
2398
2399void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2400 SDValue Cond = N->getOperand(1);
2401
2402 if (Cond.isUndef()) {
2403 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2404 N->getOperand(2), N->getOperand(0));
2405 return;
2406 }
2407
2408 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2409 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2410
2411 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2412 bool AndExec = !UseSCCBr;
2413 bool Negate = false;
2414
2415 if (Cond.getOpcode() == ISD::SETCC &&
2416 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2417 SDValue VCMP = Cond->getOperand(0);
2418 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2419 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2420 isNullConstant(Cond->getOperand(1)) &&
2421 // We may encounter ballot.i64 in wave32 mode on -O0.
2422 VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2423 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2424 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2425 // BRCOND i1 %C, %BB
2426 // =>
2427 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2428 // VCC = COPY i(WaveSize) %VCMP
2429 // S_CBRANCH_VCCNZ/VCCZ %BB
2430 Negate = CC == ISD::SETEQ;
2431 bool NegatedBallot = false;
2432 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2433 Cond = BallotCond;
2434 UseSCCBr = !BallotCond->isDivergent();
2435 Negate = Negate ^ NegatedBallot;
2436 } else {
2437 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2438 // selected as V_CMP, but this may change for a uniform condition.
2439 Cond = VCMP;
2440 UseSCCBr = false;
2441 }
2442 }
2443 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2444 // V_CMPs resulting from ballot, or a ballot with a uniform condition, in
2445 // which case SCC is used.
2446 AndExec = false;
2447 }
2448
2449 unsigned BrOp =
2450 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2451 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2452 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2453 SDLoc SL(N);
2454
2455 if (AndExec) {
2456 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2457 // analyzed what generates the vcc value, so we do not know whether vcc
2458 // bits for disabled lanes are 0. Thus we need to mask out bits for
2459 // disabled lanes.
2460 //
2461 // For the case that we select S_CBRANCH_SCC1 and it gets
2462 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2463 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2464 //
2465 // We could add an analysis of what generates the vcc value here and omit
2466 // the S_AND when it is unnecessary. But it would be better to add a separate
2467 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2468 // catches both cases.
2469 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2470 : AMDGPU::S_AND_B64,
2471 SL, MVT::i1,
2472 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2473 : AMDGPU::EXEC,
2474 MVT::i1),
2475 Cond),
2476 0);
2477 }
2478
2479 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2480 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2481 N->getOperand(2), // Basic Block
2482 VCC.getValue(0));
2483}
2484
2485void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2486 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2487 !N->isDivergent()) {
2488 SDValue Src = N->getOperand(0);
2489 if (Src.getValueType() == MVT::f16) {
2490 if (isExtractHiElt(Src, Src)) {
2491 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2492 {Src});
2493 return;
2494 }
2495 }
2496 }
2497
2498 SelectCode(N);
2499}
2500
2501void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2502 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2503 // be copied to an SGPR with readfirstlane.
2504 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2505 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2506
2507 SDValue Chain = N->getOperand(0);
2508 SDValue Ptr = N->getOperand(2);
2509 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2510 MachineMemOperand *MMO = M->getMemOperand();
2511 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2512
2513 SDValue Offset;
2514 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2515 SDValue PtrBase = Ptr.getOperand(0);
2516 SDValue PtrOffset = Ptr.getOperand(1);
2517
2518 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2519 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2520 N = glueCopyToM0(N, PtrBase);
2521 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2522 }
2523 }
2524
2525 if (!Offset) {
2526 N = glueCopyToM0(N, Ptr);
2527 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2528 }
2529
2530 SDValue Ops[] = {
2531 Offset,
2532 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2533 Chain,
2534 N->getOperand(N->getNumOperands() - 1) // New glue
2535 };
2536
2537 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2538 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2539}
2540
2541// We need to handle this here because tablegen doesn't support matching
2542// instructions with multiple outputs.
2543void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2544 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2545 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2546 N->getOperand(5), N->getOperand(0)};
2547
2548 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2549 MachineMemOperand *MMO = M->getMemOperand();
2550 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2551 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2552}
2553
2554void AMDGPUDAGToDAGISel::SelectPOPSExitingWaveID(SDNode *N) {
2555 // TODO: Select this with a tablegen pattern. This is tricky because the
2556 // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
2557 // mayLoad/mayStore and tablegen complains about the mismatch.
2558 SDValue Reg = CurDAG->getRegister(AMDGPU::SRC_POPS_EXITING_WAVE_ID, MVT::i32);
2559 CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, N->getVTList(), Reg);
2560}
2561
2562static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2563 switch (IntrID) {
2564 case Intrinsic::amdgcn_ds_gws_init:
2565 return AMDGPU::DS_GWS_INIT;
2566 case Intrinsic::amdgcn_ds_gws_barrier:
2567 return AMDGPU::DS_GWS_BARRIER;
2568 case Intrinsic::amdgcn_ds_gws_sema_v:
2569 return AMDGPU::DS_GWS_SEMA_V;
2570 case Intrinsic::amdgcn_ds_gws_sema_br:
2571 return AMDGPU::DS_GWS_SEMA_BR;
2572 case Intrinsic::amdgcn_ds_gws_sema_p:
2573 return AMDGPU::DS_GWS_SEMA_P;
2574 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2575 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2576 default:
2577 llvm_unreachable("not a gws intrinsic");
2578 }
2579}
2580
2581void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2582 if (!Subtarget->hasGWS() ||
2583 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2584 !Subtarget->hasGWSSemaReleaseAll())) {
2585 // Let this error.
2586 SelectCode(N);
2587 return;
2588 }
2589
2590 // Chain, intrinsic ID, vsrc, offset
2591 const bool HasVSrc = N->getNumOperands() == 4;
2592 assert(HasVSrc || N->getNumOperands() == 3);
2593
2594 SDLoc SL(N);
2595 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2596 int ImmOffset = 0;
2597 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2598 MachineMemOperand *MMO = M->getMemOperand();
2599
2600 // Don't worry if the offset ends up in a VGPR. Only one lane will have
2601 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2602
2603 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2604 // offset field) % 64. Some versions of the programming guide omit the m0
2605 // part, or claim it's from offset 0.
2606 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2607 // If we have a constant offset, try to use the 0 in m0 as the base.
2608 // TODO: Look into changing the default m0 initialization value. If the
2609 // default -1 only sets the low 16 bits, we could leave it as-is and add 1 to
2610 // the immediate offset.
2611 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2612 ImmOffset = ConstOffset->getZExtValue();
2613 } else {
2614 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2615 ImmOffset = BaseOffset.getConstantOperandVal(1);
2616 BaseOffset = BaseOffset.getOperand(0);
2617 }
2618
2619 // Prefer to do the shift in an SGPR since it should be possible to use m0
2620 // as the result directly. If it's already an SGPR, it will be eliminated
2621 // later.
2622 SDNode *SGPROffset
2623 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2624 BaseOffset);
2625 // Shift to offset in m0
2626 SDNode *M0Base
2627 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2628 SDValue(SGPROffset, 0),
2629 CurDAG->getTargetConstant(16, SL, MVT::i32));
2630 glueCopyToM0(N, SDValue(M0Base, 0));
2631 }
2632
2633 SDValue Chain = N->getOperand(0);
2634 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2635
2636 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2637 SmallVector<SDValue, 5> Ops;
2638 if (HasVSrc)
2639 Ops.push_back(N->getOperand(2));
2640 Ops.push_back(OffsetField);
2641 Ops.push_back(Chain);
2642
2643 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2644 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2645}
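// Sketch of the non-constant offset path above (illustrative MIR, register
// names assumed):
//   %sgpr = V_READFIRSTLANE_B32 %base_offset
//   %m0val = S_LSHL_B32 %sgpr, 16    ; resource id lands in M0[21:16]
//   $m0 = COPY %m0val                ; via glueCopyToM0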
2646
2647void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2648 if (Subtarget->getLDSBankCount() != 16) {
2649 // This is a single instruction with a pattern.
2650 SelectCode(N);
2651 return;
2652 }
2653
2654 SDLoc DL(N);
2655
2656 // This requires 2 instructions. It is possible to write a pattern to support
2657 // this, but the generated isel emitter doesn't correctly deal with multiple
2658 // output instructions using the same physical register input. The copy to m0
2659 // is incorrectly placed before the second instruction.
2660 //
2661 // TODO: Match source modifiers.
2662 //
2663 // def : Pat <
2664 // (int_amdgcn_interp_p1_f16
2665 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2666 // (i32 timm:$attrchan), (i32 timm:$attr),
2667 // (i1 timm:$high), M0),
2668 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2669 // timm:$attrchan, 0,
2670 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2671 // let Predicates = [has16BankLDS];
2672 // }
2673
2674 // 16 bank LDS
2675 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2676 N->getOperand(5), SDValue());
2677
2678 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2679
2680 SDNode *InterpMov =
2681 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2682 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2683 N->getOperand(3), // Attr
2684 N->getOperand(2), // Attrchan
2685 ToM0.getValue(1) // In glue
2686 });
2687
2688 SDNode *InterpP1LV =
2689 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2690 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2691 N->getOperand(1), // Src0
2692 N->getOperand(3), // Attr
2693 N->getOperand(2), // Attrchan
2694 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2695 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2696 N->getOperand(4), // high
2697 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2698 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2699 SDValue(InterpMov, 1)
2700 });
2701
2702 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2703}
2704
2705void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2706 unsigned IntrID = N->getConstantOperandVal(1);
2707 switch (IntrID) {
2708 case Intrinsic::amdgcn_ds_append:
2709 case Intrinsic::amdgcn_ds_consume: {
2710 if (N->getValueType(0) != MVT::i32)
2711 break;
2712 SelectDSAppendConsume(N, IntrID);
2713 return;
2714 }
2715 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2716 SelectDSBvhStackIntrinsic(N);
2717 return;
2718 case Intrinsic::amdgcn_pops_exiting_wave_id:
2719 SelectPOPSExitingWaveID(N);
2720 return;
2721 }
2722
2723 SelectCode(N);
2724}
2725
2726void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2727 unsigned IntrID = N->getConstantOperandVal(0);
2728 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2729 SDNode *ConvGlueNode = N->getGluedNode();
2730 if (ConvGlueNode) {
2731 // FIXME: Possibly iterate over multiple glue nodes?
2732 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2733 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2734 ConvGlueNode =
2735 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2736 MVT::Glue, SDValue(ConvGlueNode, 0));
2737 } else {
2738 ConvGlueNode = nullptr;
2739 }
2740 switch (IntrID) {
2741 case Intrinsic::amdgcn_wqm:
2742 Opcode = AMDGPU::WQM;
2743 break;
2744 case Intrinsic::amdgcn_softwqm:
2745 Opcode = AMDGPU::SOFT_WQM;
2746 break;
2747 case Intrinsic::amdgcn_wwm:
2748 case Intrinsic::amdgcn_strict_wwm:
2749 Opcode = AMDGPU::STRICT_WWM;
2750 break;
2751 case Intrinsic::amdgcn_strict_wqm:
2752 Opcode = AMDGPU::STRICT_WQM;
2753 break;
2754 case Intrinsic::amdgcn_interp_p1_f16:
2755 SelectInterpP1F16(N);
2756 return;
2757 case Intrinsic::amdgcn_inverse_ballot:
2758 switch (N->getOperand(1).getValueSizeInBits()) {
2759 case 32:
2760 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2761 break;
2762 case 64:
2763 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2764 break;
2765 default:
2766 llvm_unreachable("Unsupported size for inverse ballot mask.");
2767 }
2768 break;
2769 default:
2770 SelectCode(N);
2771 break;
2772 }
2773
2774 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2775 SDValue Src = N->getOperand(1);
2776 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2777 }
2778
2779 if (ConvGlueNode) {
2780 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
2781 NewOps.push_back(SDValue(ConvGlueNode, 0));
2782 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2783 }
2784}
2785
2786void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2787 unsigned IntrID = N->getConstantOperandVal(1);
2788 switch (IntrID) {
2789 case Intrinsic::amdgcn_ds_gws_init:
2790 case Intrinsic::amdgcn_ds_gws_barrier:
2791 case Intrinsic::amdgcn_ds_gws_sema_v:
2792 case Intrinsic::amdgcn_ds_gws_sema_br:
2793 case Intrinsic::amdgcn_ds_gws_sema_p:
2794 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2795 SelectDS_GWS(N, IntrID);
2796 return;
2797 default:
2798 break;
2799 }
2800
2801 SelectCode(N);
2802}
2803
2804void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2805 SDValue Log2WaveSize =
2806 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2807 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2808 {N->getOperand(0), Log2WaveSize});
2809}
2810
2811void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2812 SDValue SrcVal = N->getOperand(1);
2813 if (SrcVal.getValueType() != MVT::i32) {
2814 SelectCode(N); // Emit default error
2815 return;
2816 }
2817
2818 SDValue CopyVal;
2819 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2820 SDLoc SL(N);
2821
2822 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2823 CopyVal = SrcVal.getOperand(0);
2824 } else {
2825 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2826 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2827
2828 if (N->isDivergent()) {
2829 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2830 MVT::i32, SrcVal),
2831 0);
2832 }
2833
2834 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2835 {SrcVal, Log2WaveSize}),
2836 0);
2837 }
2838
2839 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2840 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2841}
2842
2843bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2844 unsigned &Mods,
2845 bool IsCanonicalizing,
2846 bool AllowAbs) const {
2847 Mods = SISrcMods::NONE;
2848 Src = In;
2849
2850 if (Src.getOpcode() == ISD::FNEG) {
2851 Mods |= SISrcMods::NEG;
2852 Src = Src.getOperand(0);
2853 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2854 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2855 // denormal mode, but we're implicitly canonicalizing in a source operand.
2856 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2857 if (LHS && LHS->isZero()) {
2858 Mods |= SISrcMods::NEG;
2859 Src = Src.getOperand(1);
2860 }
2861 }
2862
2863 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2864 Mods |= SISrcMods::ABS;
2865 Src = Src.getOperand(0);
2866 }
2867
2868 return true;
2869}
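// Example of the folding above (illustrative): for In = (fneg (fabs x)) the
// FNEG is stripped first (Mods |= NEG) and then the FABS (Mods |= ABS),
// leaving Src = x with modifiers encoding -|x|. With IsCanonicalizing set,
// (fsub 0.0, x) is treated the same way as (fneg x).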
2870
2871bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2872 SDValue &SrcMods) const {
2873 unsigned Mods;
2874 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2875 /*AllowAbs=*/true)) {
2876 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2877 return true;
2878 }
2879
2880 return false;
2881}
2882
2883bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2884 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2885 unsigned Mods;
2886 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2887 /*AllowAbs=*/true)) {
2888 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2889 return true;
2890 }
2891
2892 return false;
2893}
2894
2895bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2896 SDValue &SrcMods) const {
2897 unsigned Mods;
2898 if (SelectVOP3ModsImpl(In, Src, Mods,
2899 /*IsCanonicalizing=*/true,
2900 /*AllowAbs=*/false)) {
2901 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2902 return true;
2903 }
2904
2905 return false;
2906}
2907
2908bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2909 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2910 return false;
2911
2912 Src = In;
2913 return true;
2914}
2915
2916bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2917 SDValue &SrcMods,
2918 bool OpSel) const {
2919 unsigned Mods;
2920 if (SelectVOP3ModsImpl(In, Src, Mods,
2921 /*IsCanonicalizing=*/true,
2922 /*AllowAbs=*/false)) {
2923 if (OpSel)
2924 Mods |= SISrcMods::OP_SEL_0;
2925 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2926 return true;
2927 }
2928
2929 return false;
2930}
2931
2932bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2933 SDValue &SrcMods) const {
2934 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2935}
2936
2937bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2938 SDValue &SrcMods) const {
2939 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2940}
2941
2942bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2943 SDValue &SrcMods, SDValue &Clamp,
2944 SDValue &Omod) const {
2945 SDLoc DL(In);
2946 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2947 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2948
2949 return SelectVOP3Mods(In, Src, SrcMods);
2950}
2951
2952bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2953 SDValue &SrcMods, SDValue &Clamp,
2954 SDValue &Omod) const {
2955 SDLoc DL(In);
2956 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2957 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2958
2959 return SelectVOP3BMods(In, Src, SrcMods);
2960}
2961
2962bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2963 SDValue &Clamp, SDValue &Omod) const {
2964 Src = In;
2965
2966 SDLoc DL(In);
2967 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2968 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2969
2970 return true;
2971}
2972
2973bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2974 SDValue &SrcMods, bool IsDOT) const {
2975 unsigned Mods = SISrcMods::NONE;
2976 Src = In;
2977
2978 // TODO: Handle G_FSUB 0 as fneg
2979 if (Src.getOpcode() == ISD::FNEG) {
2980 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2981 Src = Src.getOperand(0);
2982 }
2983
2984 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2985 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2986 unsigned VecMods = Mods;
2987
2988 SDValue Lo = stripBitcast(Src.getOperand(0));
2989 SDValue Hi = stripBitcast(Src.getOperand(1));
2990
2991 if (Lo.getOpcode() == ISD::FNEG) {
2992 Lo = stripBitcast(Lo.getOperand(0));
2993 Mods ^= SISrcMods::NEG;
2994 }
2995
2996 if (Hi.getOpcode() == ISD::FNEG) {
2997 Hi = stripBitcast(Hi.getOperand(0));
2998 Mods ^= SISrcMods::NEG_HI;
2999 }
3000
3001 if (isExtractHiElt(Lo, Lo))
3002 Mods |= SISrcMods::OP_SEL_0;
3003
3004 if (isExtractHiElt(Hi, Hi))
3005 Mods |= SISrcMods::OP_SEL_1;
3006
3007 unsigned VecSize = Src.getValueSizeInBits();
3008 Lo = stripExtractLoElt(Lo);
3009 Hi = stripExtractLoElt(Hi);
3010
3011 if (Lo.getValueSizeInBits() > VecSize) {
3012 Lo = CurDAG->getTargetExtractSubreg(
3013 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3014 MVT::getIntegerVT(VecSize), Lo);
3015 }
3016
3017 if (Hi.getValueSizeInBits() > VecSize) {
3018 Hi = CurDAG->getTargetExtractSubreg(
3019 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3020 MVT::getIntegerVT(VecSize), Hi);
3021 }
3022
3023 assert(Lo.getValueSizeInBits() <= VecSize &&
3024 Hi.getValueSizeInBits() <= VecSize);
3025
3026 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3027 // Really a scalar input. Just select from the low half of the register to
3028 // avoid packing.
3029
3030 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3031 Src = Lo;
3032 } else {
3033 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3034
3035 SDLoc SL(In);
3036 SDValue Undef = SDValue(
3037 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3038 Lo.getValueType()), 0);
3039 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3040 : AMDGPU::SReg_64RegClassID;
3041 const SDValue Ops[] = {
3042 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3043 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3044 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3045
3046 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3047 Src.getValueType(), Ops), 0);
3048 }
3049 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3050 return true;
3051 }
3052
3053 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3054 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3055 .bitcastToAPInt().getZExtValue();
3056 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3057 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3058 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3059 return true;
3060 }
3061 }
3062
3063 Mods = VecMods;
3064 }
3065
3066 // Packed instructions do not have abs modifiers.
3067 Mods |= SISrcMods::OP_SEL_1;
3068
3069 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3070 return true;
3071}
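// Example for the packed path above (illustrative): for
// In = (build_vector (fneg a), (fneg b)) the negations fold into the NEG and
// NEG_HI source modifiers, and when a or b is an extract of the high half of
// a 32-bit register the OP_SEL bits select that half directly instead of
// requiring a repack.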
3072
3073bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3074 SDValue &SrcMods) const {
3075 return SelectVOP3PMods(In, Src, SrcMods, true);
3076}
3077
3078bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3079 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3080 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3081 // 1 promotes packed values to signed, 0 treats them as unsigned.
3082 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3083
3084 unsigned Mods = SISrcMods::OP_SEL_1;
3085 unsigned SrcSign = C->getZExtValue();
3086 if (SrcSign == 1)
3087 Mods ^= SISrcMods::NEG;
3088
3089 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3090 return true;
3091}
3092
3093bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3094 SDValue &Src) const {
3095 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3096 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3097
3098 unsigned Mods = SISrcMods::OP_SEL_1;
3099 unsigned SrcVal = C->getZExtValue();
3100 if (SrcVal == 1)
3101 Mods |= SISrcMods::OP_SEL_0;
3102
3103 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3104 return true;
3105}
3106
3107static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3108 llvm::SelectionDAG *CurDAG,
3109 const SDLoc &DL) {
3110 unsigned DstRegClass;
3111 EVT DstTy;
3112 switch (Elts.size()) {
3113 case 8:
3114 DstRegClass = AMDGPU::VReg_256RegClassID;
3115 DstTy = MVT::v8i32;
3116 break;
3117 case 4:
3118 DstRegClass = AMDGPU::VReg_128RegClassID;
3119 DstTy = MVT::v4i32;
3120 break;
3121 case 2:
3122 DstRegClass = AMDGPU::VReg_64RegClassID;
3123 DstTy = MVT::v2i32;
3124 break;
3125 default:
3126 llvm_unreachable("unhandled Reg sequence size");
3127 }
3128
3129 SmallVector<SDValue, 8 + 1> Ops;
3130 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3131 for (unsigned i = 0; i < Elts.size(); ++i) {
3132 Ops.push_back(Elts[i]);
3133 Ops.push_back(CurDAG->getTargetConstant(
3134 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3135 }
3136 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3137}
3138
3139static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3140 llvm::SelectionDAG *CurDAG,
3141 const SDLoc &DL) {
3142 SmallVector<SDValue, 8> PackedElts;
3143 assert((Elts.size() == 8 || Elts.size() == 16) &&
3144 "unhandled Reg sequence size");
3145
3146 // Pack 16-bit elements in pairs into a 32-bit register. If both elements
3147 // are unpacked from the same 32-bit source, use it; otherwise pack with v_perm.
3148 for (unsigned i = 0; i < Elts.size(); i += 2) {
3149 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3150 SDValue HiSrc;
3151 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3152 PackedElts.push_back(HiSrc);
3153 } else {
3154 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3155 MachineSDNode *Packed =
3156 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3157 {Elts[i + 1], Elts[i], PackLoLo});
3158 PackedElts.push_back(SDValue(Packed, 0));
3159 }
3160 }
3161
3162 return buildRegSequence32(PackedElts, CurDAG, DL);
3163}
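// The V_PERM selector 0x05040100 above picks result bytes from
// {S0 = Elts[i + 1], S1 = Elts[i]}: selectors 0-3 read bytes of S1 and 4-7
// read bytes of S0, so bytes {0x00, 0x01, 0x04, 0x05} pack the low 16 bits
// of Elts[i] into the low half and the low 16 bits of Elts[i + 1] into the
// high half of the result.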
3164
3165static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3166 llvm::SelectionDAG *CurDAG,
3167 const SDLoc &DL, unsigned ElementSize) {
3168 if (ElementSize == 16)
3169 return buildRegSequence16(Elts, CurDAG, DL);
3170 if (ElementSize == 32)
3171 return buildRegSequence32(Elts, CurDAG, DL);
3172 llvm_unreachable("Unhandled element size");
3173}
3174
3175static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3176 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3177 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3178 unsigned ElementSize) {
3179 if (ModOpcode == ISD::FNEG) {
3180 Mods |= SISrcMods::NEG;
3181 // Check if all elements also have abs modifier
3182 SmallVector<SDValue, 8> NegAbsElts;
3183 for (auto El : Elts) {
3184 if (El.getOpcode() != ISD::FABS)
3185 break;
3186 NegAbsElts.push_back(El->getOperand(0));
3187 }
3188 if (Elts.size() != NegAbsElts.size()) {
3189 // Neg
3190 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3191 } else {
3192 // Neg and Abs
3193 Mods |= SISrcMods::NEG_HI;
3194 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3195 }
3196 } else {
3197 assert(ModOpcode == ISD::FABS);
3198 // Abs
3199 Mods |= SISrcMods::NEG_HI;
3200 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3201 }
3202}
3203
3204// Check all f16 elements for modifiers while looking through b32 and v2b16
3205// build vectors; stop if an element does not satisfy ModifierCheck.
3206static void
3207checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3208 std::function<bool(SDValue)> ModifierCheck) {
3209 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3210 if (auto *F16Pair =
3211 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3212 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3213 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3214 if (!ModifierCheck(ElF16))
3215 break;
3216 }
3217 }
3218 }
3219}
3220
3221bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3222 SDValue &SrcMods) const {
3223 Src = In;
3224 unsigned Mods = SISrcMods::OP_SEL_1;
3225
3226 // mods are on f16 elements
3227 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3228 SmallVector<SDValue, 8> EltsF16;
3229
3230 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3231 if (Element.getOpcode() != ISD::FNEG)
3232 return false;
3233 EltsF16.push_back(Element.getOperand(0));
3234 return true;
3235 });
3236
3237 // All elements have neg modifier
3238 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3239 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3240 Mods |= SISrcMods::NEG;
3241 Mods |= SISrcMods::NEG_HI;
3242 }
3243 }
3244
3245 // mods are on v2f16 elements
3246 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3247 SmallVector<SDValue, 8> EltsV2F16;
3248 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3249 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3250 // Based on first element decide which mod we match, neg or abs
3251 if (ElV2f16.getOpcode() != ISD::FNEG)
3252 break;
3253 EltsV2F16.push_back(ElV2f16.getOperand(0));
3254 }
3255
3256 // All pairs of elements have neg modifier
3257 if (BV->getNumOperands() == EltsV2F16.size()) {
3258 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3259 Mods |= SISrcMods::NEG;
3260 Mods |= SISrcMods::NEG_HI;
3261 }
3262 }
3263
3264 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3265 return true;
3266}
3267
3268bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3269 SDValue &SrcMods) const {
3270 Src = In;
3271 unsigned Mods = SISrcMods::OP_SEL_1;
3272 unsigned ModOpcode;
3273
3274 // mods are on f16 elements
3275 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3276 SmallVector<SDValue, 8> EltsF16;
3277 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3278 // Based on first element decide which mod we match, neg or abs
3279 if (EltsF16.empty())
3280 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3281 if (ElF16.getOpcode() != ModOpcode)
3282 return false;
3283 EltsF16.push_back(ElF16.getOperand(0));
3284 return true;
3285 });
3286
3287 // All elements have ModOpcode modifier
3288 if (BV->getNumOperands() * 2 == EltsF16.size())
3289 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3290 16);
3291 }
3292
3293 // mods are on v2f16 elements
3294 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3295 SmallVector<SDValue, 8> EltsV2F16;
3296
3297 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3298 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3299 // Based on first element decide which mod we match, neg or abs
3300 if (EltsV2F16.empty())
3301 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3302 if (ElV2f16->getOpcode() != ModOpcode)
3303 break;
3304 EltsV2F16.push_back(ElV2f16->getOperand(0));
3305 }
3306
3307 // All elements have ModOpcode modifier
3308 if (BV->getNumOperands() == EltsV2F16.size())
3309 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3310 32);
3311 }
3312
3313 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3314 return true;
3315}
3316
3317bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3318 SDValue &SrcMods) const {
3319 Src = In;
3320 unsigned Mods = SISrcMods::OP_SEL_1;
3321 SmallVector<SDValue, 8> EltsF32;
3322
3323 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3324 assert(BV->getNumOperands() > 0);
3325 // Based on first element decide which mod we match, neg or abs
3326 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3327 unsigned ModOpcode =
3328 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3329 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3330 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3331 if (ElF32.getOpcode() != ModOpcode)
3332 break;
3333 EltsF32.push_back(ElF32.getOperand(0));
3334 }
3335
3336 // All elements had ModOpcode modifier
3337 if (BV->getNumOperands() == EltsF32.size())
3338 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3339 32);
3340 }
3341
3342 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3343 return true;
3344}
3345
3346bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3347 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3348 BitVector UndefElements;
3349 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3350 if (isInlineImmediate(Splat.getNode())) {
3351 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3352 unsigned Imm = C->getAPIntValue().getSExtValue();
3353 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3354 return true;
3355 }
3356 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3357 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3358 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3359 return true;
3360 }
3361 llvm_unreachable("unhandled Constant node");
3362 }
3363 }
3364
3365 // 16 bit splat
3366 SDValue SplatSrc32 = stripBitcast(In);
3367 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3368 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3369 SDValue SplatSrc16 = stripBitcast(Splat32);
3370 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3371 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3372 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3373 std::optional<APInt> RawValue;
3374 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3375 RawValue = C->getValueAPF().bitcastToAPInt();
3376 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3377 RawValue = C->getAPIntValue();
3378
3379 if (RawValue.has_value()) {
3380 EVT VT = In.getValueType().getScalarType();
3381 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3382 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3383 ? APFloat::IEEEhalf()
3384 : APFloat::BFloat(),
3385 RawValue.value());
3386 if (TII->isInlineConstant(FloatVal)) {
3387 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3388 MVT::i16);
3389 return true;
3390 }
3391 } else if (VT.getSimpleVT() == MVT::i16) {
3392 if (TII->isInlineConstant(RawValue.value())) {
3393 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3394 MVT::i16);
3395 return true;
3396 }
3397 } else
3398 llvm_unreachable("unknown 16-bit type");
3399 }
3400 }
3401 }
3402
3403 return false;
3404}
3405
3406bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3407 SDValue &IndexKey) const {
3408 unsigned Key = 0;
3409 Src = In;
3410
3411 if (In.getOpcode() == ISD::SRL) {
3412 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3413 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3414 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3415 ShiftAmt->getZExtValue() % 8 == 0) {
3416 Key = ShiftAmt->getZExtValue() / 8;
3417 Src = ShiftSrc;
3418 }
3419 }
3420
3421 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3422 return true;
3423}
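// Example (illustrative): for In = (srl x, 16) with a 32-bit x, the shift
// amount is a multiple of 8, so Key = 16 / 8 = 2 and Src = x; the index_key
// operand then selects byte 2 of x without materializing the shifted value.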
3424
3425bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3426 SDValue &IndexKey) const {
3427 unsigned Key = 0;
3428 Src = In;
3429
3430 if (In.getOpcode() == ISD::SRL) {
3431 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3432 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3433 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3434 ShiftAmt->getZExtValue() == 16) {
3435 Key = 1;
3436 Src = ShiftSrc;
3437 }
3438 }
3439
3440 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3441 return true;
3442}
3443
3444bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3445 SDValue &SrcMods) const {
3446 Src = In;
3447 // FIXME: Handle op_sel
3448 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3449 return true;
3450}
3451
3452bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3453 SDValue &SrcMods) const {
3454 // FIXME: Handle op_sel
3455 return SelectVOP3Mods(In, Src, SrcMods);
3456}
3457
3458// The return value is not whether the match is possible (which it always is),
3459// but whether or not a conversion is really used.
3460bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3461 unsigned &Mods) const {
3462 Mods = 0;
3463 SelectVOP3ModsImpl(In, Src, Mods);
3464
3465 if (Src.getOpcode() == ISD::FP_EXTEND) {
3466 Src = Src.getOperand(0);
3467 assert(Src.getValueType() == MVT::f16);
3468 Src = stripBitcast(Src);
3469
3470 // Be careful about folding modifiers if we already have an abs. fneg is
3471 // applied last, so we don't want to apply an earlier fneg.
3472 if ((Mods & SISrcMods::ABS) == 0) {
3473 unsigned ModsTmp;
3474 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3475
3476 if ((ModsTmp & SISrcMods::NEG) != 0)
3477 Mods ^= SISrcMods::NEG;
3478
3479 if ((ModsTmp & SISrcMods::ABS) != 0)
3480 Mods |= SISrcMods::ABS;
3481 }
3482
3483 // op_sel/op_sel_hi decide the source type and source.
3484 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3485 // If the source's op_sel is set, it picks the high half of the source
3486 // register.
3487
3488 Mods |= SISrcMods::OP_SEL_1;
3489 if (isExtractHiElt(Src, Src)) {
3490 Mods |= SISrcMods::OP_SEL_0;
3491
3492 // TODO: Should we try to look for neg/abs here?
3493 }
3494
3495 return true;
3496 }
3497
3498 return false;
3499}
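// Example (with %x as an illustrative f16 value): In == (fp_extend (fneg
// f16:%x)) carries no outer modifiers, so the fp_extend is stripped, the
// inner fneg is folded into Mods, and OP_SEL_1 marks the operand as an
// f16 source to be converted. If %x is itself the high half of a 32-bit
// register, OP_SEL_0 selects that half as well.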
3500
3501bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3502 SDValue &SrcMods) const {
3503 unsigned Mods = 0;
3504 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3505 return false;
3506 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3507 return true;
3508}
3509
3510bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3511 SDValue &SrcMods) const {
3512 unsigned Mods = 0;
3513 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3514 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3515 return true;
3516}
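// The two entry points differ only in their failure mode: the Ext variant
// above refuses the match when no fp16-to-fp32 conversion was found,
// while this one always succeeds and simply reports the modifiers.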
3517
3518SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3519 if (In.isUndef())
3520 return CurDAG->getUNDEF(MVT::i32);
3521
3522 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3523 SDLoc SL(In);
3524 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3525 }
3526
3527 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3528 SDLoc SL(In);
3529 return CurDAG->getConstant(
3530 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3531 }
3532
3533 SDValue Src;
3534 if (isExtractHiElt(In, Src))
3535 return Src;
3536
3537 return SDValue();
3538}
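// For instance, an i16-typed ConstantSDNode holding 0x1234 comes back as
// the i32 constant 0x12340000, ready to be packed into the high half of a
// 32-bit register; a null SDValue signals that no cheap high-half form
// exists for this input.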
3539
3540 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3541 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3542
3543 const SIRegisterInfo *SIRI =
3544 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3545 const SIInstrInfo * SII =
3546 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3547
3548 unsigned Limit = 0;
3549 bool AllUsesAcceptSReg = true;
3550 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3551 Limit < 10 && U != E; ++U, ++Limit) {
3552 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3553
3554 // If the register class is unknown, it may be a class that needs to be
3555 // an SGPR, e.g. an inline asm constraint, so conservatively treat it as
3556 // SGPR-only and reject.
3557 if (!RC || SIRI->isSGPRClass(RC))
3558 return false;
3559
3560 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3561 AllUsesAcceptSReg = false;
3562 SDNode * User = *U;
3563 if (User->isMachineOpcode()) {
3564 unsigned Opc = User->getMachineOpcode();
3565 const MCInstrDesc &Desc = SII->get(Opc);
3566 if (Desc.isCommutable()) {
3567 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3568 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3569 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3570 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3571 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3572 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3573 CommutedRC == &AMDGPU::VS_64RegClass)
3574 AllUsesAcceptSReg = true;
3575 }
3576 }
3577 }
3578 // If AllUsesAcceptSReg is still false, we have not managed to commute
3579 // the current user, meaning at least one use strictly requires a VGPR.
3580 // In that case there is no point in attempting to commute the remaining
3581 // users.
3582 if (!AllUsesAcceptSReg)
3583 break;
3584 }
3585 }
3586 return !AllUsesAcceptSReg && (Limit < 10);
3587}
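// Summing up: the immediate is worth keeping in a VGPR only if the node
// has fewer than ten uses, none of them demands an SGPR (or an unknown
// register class), and at least one of them cannot accept an SGPR even
// after commuting.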
3588
3589bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3590 auto Ld = cast<LoadSDNode>(N);
3591
3592 const MachineMemOperand *MMO = Ld->getMemOperand();
3593 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3594 return false;
3595
3596 return MMO->getSize().hasValue() &&
3597 Ld->getAlign() >=
3598 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3599 uint64_t(4))) &&
3600 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3601 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3602 (Subtarget->getScalarizeGlobalBehavior() &&
3603 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3604 Ld->isSimple() &&
3605 static_cast<const SITargetLowering *>(getTargetLowering())
3606 ->isMemOpHasNoClobberedMemOperand(N)));
3607}
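// In words: the load must be non-divergent (or backed by a known-uniform
// memory operand), aligned to at least min(size, 4) bytes, and either
// read from the constant address space or be a simple, provably
// unclobbered global load on subtargets that scalarize global loads.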
3608
3609 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3610 const AMDGPUTargetLowering& Lowering =
3611 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3612 bool IsModified = false;
3613 do {
3614 IsModified = false;
3615
3616 // Go over all selected nodes and try to fold them a bit more.
3617 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3618 while (Position != CurDAG->allnodes_end()) {
3619 SDNode *Node = &*Position++;
3620 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3621 if (!MachineNode)
3622 continue;
3623
3624 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3625 if (ResNode != Node) {
3626 if (ResNode)
3627 ReplaceUses(Node, ResNode);
3628 IsModified = true;
3629 }
3630 }
3631 CurDAG->RemoveDeadNodes();
3632 } while (IsModified);
3633}
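// The loop runs to a fixed point on purpose: PostISelFolding can create
// machine nodes that are themselves foldable, so the whole node list is
// rescanned until one full pass makes no change.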
3634
3635 AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3636 CodeGenOptLevel OptLevel)
3637 : SelectionDAGISelLegacy(
3638 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3639