//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16 bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}
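
// Added illustration (not in the original source): in DAG terms the two
// shapes recognized above are
//   (extract_vector_elt (v2i16 V), 1)  -> Out = V
//   (trunc (srl (i32 X), 16))          -> Out = X
// i.e. both read the upper half of a 32-bit value.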

// Look through operations that obscure a direct use of the low 16 bits of
// the same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}
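
// Added illustration: (extract_vector_elt (v2i16 V), 0) folds to V, and
// (trunc (i32 X)) folds to X, so callers can reason about the low 16 bits of
// the underlying 32-bit register directly.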

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
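
// Added note: a "true" result lets callers assume bits 16-31 of the 32-bit
// register holding the f16 result are already zero, e.g. when packing two
// halves into a v2f16 value without inserting an extra mask.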

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
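
// Added sketch of the rewrite performed above, in DAG terms:
//   (v2f16 (build_vector lo, (f16 (load ptr))))
//     --> (load_d16_hi ptr, (scalar_to_vector lo))
// The d16 load writes only one half of the destination register and passes
// the other half through via the tied-in operand, saving a pack.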

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}
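
// Added note: "inline constants" are operands the hardware encodes for free,
// e.g. integers -16..64 and a handful of FP values such as +-0.5, +-1.0,
// +-2.0 and +-4.0; anything else needs a 32-bit literal (see SIInstrInfo).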

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
      glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
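
// Added example of the output for Imm = 0x0000000100000002, roughly:
//   %lo:sgpr_32 = S_MOV_B32 2
//   %hi:sgpr_32 = S_MOV_B32 1
//   %v:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// i.e. a 64-bit scalar immediate becomes two 32-bit moves plus a pairing.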

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom-lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}
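
// Added example: a 32-bit shift only reads 5 bits of its shift amount, so in
// (srl x, (and y, 31)) the AND keeps at least ShAmtBits = 5 trailing ones and
// this returns true, letting selection drop the mask.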

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, this is a complicated pattern to match,
    // i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(GCNTargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
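
// Added sketch of the expansion above for a divergent i64 add, roughly:
//   %lo, vcc = V_ADD_CO_U32_e32  %lhs.sub0, %rhs.sub0
//   %hi      = V_ADDC_U32_e32    %lhs.sub1, %rhs.sub1   (consumes vcc)
//   %res     = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// Uniform nodes use the S_ADD_U32/S_ADDC_U32 pair with SCC instead.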

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of these opcodes are misleading. v_add_i32/v_sub_i32 have an
  // unsigned carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
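
// Added note: the DS offset field is an unsigned 16-bit byte offset, so a
// byte offset of 65535 can be folded while 65536 must stay in the base.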

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
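
// Added example: for a 4-byte element size the two offsets are encoded as
// 8-bit element counts, so byte offsets 0 and 1020 encode as offset0 = 0,
// offset1 = 255, while 1024 no longer fits.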

// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If the offset is too big, put the low 16 bits into the offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
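
// Added sketch: for an 8-byte, 4-byte-aligned LDS access at base %p + 8 this
// selects base = %p, offset0 = 2, offset1 = 3 (in dword units), which maps
// onto a single ds_read2_b32/ds_write2_b32.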

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
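
// Added note: e.g. an access to an alloca selects the raw frame index as
// vaddr with soffset 0 here; eliminateFrameIndex later rewrites it against
// whichever frame register is chosen once the stack layout is final.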

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  auto RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  const MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}
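
// Added illustration (immediate field widths vary by subtarget and variant):
// when the constant offset is too wide, splitFlatOffset above returns a low
// piece that fits in the instruction's offset field plus a remainder that is
// added to the base with V_ADD_* so both pieces keep the same sign.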

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}
1760
1761// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1762bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1763 SDValue Addr,
1764 SDValue &SAddr,
1765 SDValue &VOffset,
1766 SDValue &Offset) const {
1767 int64_t ImmOffset = 0;
1768
1769 // Match the immediate offset first, which canonically is moved as low as
1770 // possible.
1771
1772 SDValue LHS, RHS;
1773 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1774 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1775 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1776
1777 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1778 SIInstrFlags::FlatGlobal)) {
1779 Addr = LHS;
1780 ImmOffset = COffsetVal;
1781 } else if (!LHS->isDivergent()) {
1782 if (COffsetVal > 0) {
1783 SDLoc SL(N);
1784 // saddr + large_offset -> saddr +
1785 // (voffset = large_offset & ~MaxOffset) +
1786 // (large_offset & MaxOffset);
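 // Illustrative split, assuming a maximum legal immediate offset of 4095:
 // saddr + 0x1234 becomes voffset = 0x1000 plus an immediate of 0x234, so
 // both pieces are >= 0 and still address the same underlying object.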
1787 int64_t SplitImmOffset, RemainderOffset;
1788 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1789 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1790
1791 if (isUInt<32>(RemainderOffset)) {
1792 SDNode *VMov = CurDAG->getMachineNode(
1793 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1794 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1795 VOffset = SDValue(VMov, 0);
1796 SAddr = LHS;
1797 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1798 return true;
1799 }
1800 }
1801
1802 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
1803 // is 1 we would need to perform 1 or 2 extra moves for each half of
1804 // the constant, so it is better to do a scalar add and then issue a
1805 // single VALU instruction to materialize zero. Otherwise it takes fewer
1806 // instructions to perform VALU adds with immediates or inline literals.
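 // e.g. (illustrative) COffsetVal = 0x100000001: both 32-bit halves are 1,
 // an inline constant, so NumLiterals is 0 below and the VALU-add path wins.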
1807 unsigned NumLiterals =
1808 !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1809 !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1810 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1811 return false;
1812 }
1813 }
1814
1815 // Match the variable offset.
1816 if (Addr.getOpcode() == ISD::ADD) {
1817 LHS = Addr.getOperand(0);
1818 RHS = Addr.getOperand(1);
1819
1820 if (!LHS->isDivergent()) {
1821 // add (i64 sgpr), (zero_extend (i32 vgpr))
1822 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1823 SAddr = LHS;
1824 VOffset = ZextRHS;
1825 }
1826 }
1827
1828 if (!SAddr && !RHS->isDivergent()) {
1829 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1830 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1831 SAddr = RHS;
1832 VOffset = ZextLHS;
1833 }
1834 }
1835
1836 if (SAddr) {
1837 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1838 return true;
1839 }
1840 }
1841
1842 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1843 isa<ConstantSDNode>(Addr))
1844 return false;
1845
1846 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1847 // moves required to copy a 64-bit SGPR to VGPR.
1848 SAddr = Addr;
1849 SDNode *VMov =
1850 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1851 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1852 VOffset = SDValue(VMov, 0);
1853 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1854 return true;
1855}
1856
1857static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1858 if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1859 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1860 } else if (SAddr.getOpcode() == ISD::ADD &&
1861 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1862 // Materialize this into a scalar move for the scalar address to avoid
1863 // a readfirstlane.
1864 auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1865 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1866 FI->getValueType(0));
1867 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1868 MVT::i32, TFI, SAddr.getOperand(1)),
1869 0);
1870 }
1871
1872 return SAddr;
1873}
1874
1875// Match (32-bit SGPR base) + sext(imm offset)
1876bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1877 SDValue &SAddr,
1878 SDValue &Offset) const {
1879 if (Addr->isDivergent())
1880 return false;
1881
1882 SDLoc DL(Addr);
1883
1884 int64_t COffsetVal = 0;
1885
1886 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1887 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1888 SAddr = Addr.getOperand(0);
1889 } else {
1890 SAddr = Addr;
1891 }
1892
1893 SAddr = SelectSAddrFI(CurDAG, SAddr);
1894
1895 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1896
1897 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1898 SIInstrFlags::FlatScratch)) {
1899 int64_t SplitImmOffset, RemainderOffset;
1900 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1901 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1902
1903 COffsetVal = SplitImmOffset;
1904
1905 SDValue AddOffset =
1906 SAddr.getOpcode() == ISD::TargetFrameIndex
1907 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1908 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1909 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1910 SAddr, AddOffset),
1911 0);
1912 }
1913
1914 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1915
1916 return true;
1917}
1918
1919// Check whether the flat scratch SVS swizzle bug affects this access.
1920bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1921 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1922 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1923 return false;
1924
1925 // The bug affects the swizzling of SVS accesses if there is any carry out
1926 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1927 // voffset to (soffset + inst_offset).
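 // Worked example (illustrative): if VAddr can end in binary 10 and
 // (SAddr + ImmOffset) can also end in binary 10, then 2 + 2 = 4 carries out
 // of bit 1, so the conservative check below reports the bug as possible.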
1928 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1929 KnownBits SKnown = KnownBits::computeForAddSub(
1930 /*Add=*/true, /*NSW=*/false, /*NUW=*/false,
1931 CurDAG->computeKnownBits(SAddr),
1932 KnownBits::makeConstant(APInt(32, ImmOffset)));
1933 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1934 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1935 return (VMax & 3) + (SMax & 3) >= 4;
1936}
1937
1938bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1939 SDValue &VAddr, SDValue &SAddr,
1940 SDValue &Offset) const {
1941 int64_t ImmOffset = 0;
1942
1943 SDValue LHS, RHS;
1944 SDValue OrigAddr = Addr;
1945 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1946 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1947 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1948
1949 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1950 Addr = LHS;
1951 ImmOffset = COffsetVal;
1952 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1953 SDLoc SL(N);
1954 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1955 // (large_offset & MaxOffset);
1956 int64_t SplitImmOffset, RemainderOffset;
1957 std::tie(SplitImmOffset, RemainderOffset)
1958 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1959
1960 if (isUInt<32>(RemainderOffset)) {
1961 SDNode *VMov = CurDAG->getMachineNode(
1962 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1963 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1964 VAddr = SDValue(VMov, 0);
1965 SAddr = LHS;
1966 if (!isFlatScratchBaseLegal(Addr))
1967 return false;
1968 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1969 return false;
1970 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1971 return true;
1972 }
1973 }
1974 }
1975
1976 if (Addr.getOpcode() != ISD::ADD)
1977 return false;
1978
1979 LHS = Addr.getOperand(0);
1980 RHS = Addr.getOperand(1);
1981
1982 if (!LHS->isDivergent() && RHS->isDivergent()) {
1983 SAddr = LHS;
1984 VAddr = RHS;
1985 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1986 SAddr = RHS;
1987 VAddr = LHS;
1988 } else {
1989 return false;
1990 }
1991
1992 if (OrigAddr != Addr) {
1993 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1994 return false;
1995 } else {
1996 if (!isFlatScratchBaseLegalSV(OrigAddr))
1997 return false;
1998 }
1999
2000 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2001 return false;
2002 SAddr = SelectSAddrFI(CurDAG, SAddr);
2003 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
2004 return true;
2005}
2006
2007// For unbuffered smem loads, it is illegal for the immediate offset to be
2008// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2009// Handle the case where the immediate offset + SOffset is negative.
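// e.g. (illustrative) ImmOffset = -16 is only acceptable when SOffset is
// provably >= 16, which the KnownBits query below establishes.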
2010bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2011 bool Imm32Only,
2012 bool IsBuffer,
2013 int64_t ImmOffset) const {
2014 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2015 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2016 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2017 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2018 return false;
2019 }
2020
2021 return true;
2022}
2023
2024// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2025// not null) offset. If Imm32Only is true, match only 32-bit immediate
2026// offsets available on CI.
2027bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2028 SDValue *SOffset, SDValue *Offset,
2029 bool Imm32Only, bool IsBuffer,
2030 bool HasSOffset,
2031 int64_t ImmOffset) const {
2032 assert((!SOffset || !Offset) &&
2033 "Cannot match both soffset and offset at the same time!");
2034
2035 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2036 if (!C) {
2037 if (!SOffset)
2038 return false;
2039
2040 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2041 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2042 *SOffset = ByteOffsetNode;
2043 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2044 ImmOffset);
2045 }
2046 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2047 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2048 *SOffset = ByteOffsetNode.getOperand(0);
2049 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2050 ImmOffset);
2051 }
2052 }
2053 return false;
2054 }
2055
2056 SDLoc SL(ByteOffsetNode);
2057
2058 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2059 // offset for S_BUFFER instructions is unsigned.
2060 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2061 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2062 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2063 if (EncodedOffset && Offset && !Imm32Only) {
2064 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2065 return true;
2066 }
2067
2068 // SGPR and literal offsets are unsigned.
2069 if (ByteOffset < 0)
2070 return false;
2071
2072 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2073 if (EncodedOffset && Offset && Imm32Only) {
2074 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2075 return true;
2076 }
2077
2078 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2079 return false;
2080
2081 if (SOffset) {
2082 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2083 *SOffset = SDValue(
2084 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2085 return true;
2086 }
2087
2088 return false;
2089}
2090
2091SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2092 if (Addr.getValueType() != MVT::i32)
2093 return Addr;
2094
2095 // Zero-extend a 32-bit address.
2096 SDLoc SL(Addr);
2097
2098 const MachineFunction &MF = CurDAG->getMachineFunction();
2099 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2100 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2101 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2102
2103 const SDValue Ops[] = {
2104 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2105 Addr,
2106 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2107 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2108 0),
2109 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2110 };
2111
2112 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2113 Ops), 0);
2114}
2115
2116// Match a base and an immediate (if Offset is not null) or an SGPR (if
2117// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2118// true, match only 32-bit immediate offsets available on CI.
2119bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2120 SDValue *SOffset, SDValue *Offset,
2121 bool Imm32Only, bool IsBuffer,
2122 bool HasSOffset,
2123 int64_t ImmOffset) const {
2124 if (SOffset && Offset) {
2125 assert(!Imm32Only && !IsBuffer);
2126 SDValue B;
2127
2128 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2129 return false;
2130
2131 int64_t ImmOff = 0;
2132 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2133 ImmOff = C->getSExtValue();
2134
2135 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2136 ImmOff);
2137 }
2138
2139 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2140 // wraparound, because s_load instructions perform the addition in 64 bits.
2141 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2142 !Addr->getFlags().hasNoUnsignedWrap())
2143 return false;
2144
2145 SDValue N0, N1;
2146 // Extract the base and offset if possible.
2147 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2148 N0 = Addr.getOperand(0);
2149 N1 = Addr.getOperand(1);
2150 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2151 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2152 }
2153 if (!N0 || !N1)
2154 return false;
2155
2156 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2157 ImmOffset)) {
2158 SBase = N0;
2159 return true;
2160 }
2161 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2162 ImmOffset)) {
2163 SBase = N1;
2164 return true;
2165 }
2166 return false;
2167}
2168
2169bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2170 SDValue *SOffset, SDValue *Offset,
2171 bool Imm32Only) const {
2172 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2173 SBase = Expand32BitAddress(SBase);
2174 return true;
2175 }
2176
2177 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2178 SBase = Expand32BitAddress(Addr);
2179 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2180 return true;
2181 }
2182
2183 return false;
2184}
2185
2186bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2187 SDValue &Offset) const {
2188 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2189}
2190
2191bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2192 SDValue &Offset) const {
2193 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2194 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2195 /* Imm32Only */ true);
2196}
2197
2198bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2199 SDValue &SOffset) const {
2200 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2201}
2202
2203bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2204 SDValue &SOffset,
2205 SDValue &Offset) const {
2206 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2207}
2208
2209bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2210 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2211 /* Imm32Only */ false, /* IsBuffer */ true);
2212}
2213
2214bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2215 SDValue &Offset) const {
2216 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2217 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2218 /* Imm32Only */ true, /* IsBuffer */ true);
2219}
2220
2221bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2222 SDValue &Offset) const {
2223 // Match the (soffset + offset) pair as a 32-bit register base and
2224 // an immediate offset.
2225 return N.getValueType() == MVT::i32 &&
2226 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2227 &Offset, /* Imm32Only */ false,
2228 /* IsBuffer */ true);
2229}
2230
2231bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2232 SDValue &Base,
2233 SDValue &Offset) const {
2234 SDLoc DL(Index);
2235
2236 if (CurDAG->isBaseWithConstantOffset(Index)) {
2237 SDValue N0 = Index.getOperand(0);
2238 SDValue N1 = Index.getOperand(1);
2239 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2240
2241 // (add n0, c0)
2242 // Don't peel off the offset (c0) if doing so could possibly lead
2243 // the base (n0) to be negative.
2244 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
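 // e.g. (illustrative) for (add n0, 16) the offset is only peeled when n0's
 // sign bit is known zero; for (add n0, -16), n0 = index + 16 can only be
 // larger than the index, so peeling is always safe.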
2245 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2246 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2247 Base = N0;
2248 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2249 return true;
2250 }
2251 }
2252
2253 if (isa<ConstantSDNode>(Index))
2254 return false;
2255
2256 Base = Index;
2257 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2258 return true;
2259}
2260
2261SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2262 SDValue Val, uint32_t Offset,
2263 uint32_t Width) {
2264 if (Val->isDivergent()) {
2265 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2266 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2267 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2268
2269 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2270 }
2271 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2272 // Transformation function: pack the offset and width of a BFE into
2273 // the format expected by S_BFE_I32 / S_BFE_U32. In the second
2274 // source operand, bits [5:0] contain the offset and bits [22:16] the width.
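 // e.g. offset = 8, width = 16 packs to (16 << 16) | 8 = 0x00100008.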
2275 uint32_t PackedVal = Offset | (Width << 16);
2276 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2277
2278 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2279}
2280
2281void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2282 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2283 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2284 // Predicate: 0 < b <= c < 32
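 // e.g. with b = 8 and c = 16, ((a << 8) srl 16) extracts bits [23:8] of a,
 // i.e. BFE_U32 a, 8, 16.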
2285
2286 const SDValue &Shl = N->getOperand(0);
2287 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2288 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2289
2290 if (B && C) {
2291 uint32_t BVal = B->getZExtValue();
2292 uint32_t CVal = C->getZExtValue();
2293
2294 if (0 < BVal && BVal <= CVal && CVal < 32) {
2295 bool Signed = N->getOpcode() == ISD::SRA;
2296 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2297 32 - CVal));
2298 return;
2299 }
2300 }
2301 SelectCode(N);
2302}
2303
2304void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2305 switch (N->getOpcode()) {
2306 case ISD::AND:
2307 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2308 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2309 // Predicate: isMask(mask)
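 // e.g. ((a srl 4) & 0xff) becomes BFE_U32 a, 4, 8, since popcount(0xff) = 8.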
2310 const SDValue &Srl = N->getOperand(0);
2311 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2312 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2313
2314 if (Shift && Mask) {
2315 uint32_t ShiftVal = Shift->getZExtValue();
2316 uint32_t MaskVal = Mask->getZExtValue();
2317
2318 if (isMask_32(MaskVal)) {
2319 uint32_t WidthVal = llvm::popcount(MaskVal);
2320 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2321 WidthVal));
2322 return;
2323 }
2324 }
2325 }
2326 break;
2327 case ISD::SRL:
2328 if (N->getOperand(0).getOpcode() == ISD::AND) {
2329 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2330 // Predicate: isMask(mask >> b)
2331 const SDValue &And = N->getOperand(0);
2332 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2333 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2334
2335 if (Shift && Mask) {
2336 uint32_t ShiftVal = Shift->getZExtValue();
2337 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2338
2339 if (isMask_32(MaskVal)) {
2340 uint32_t WidthVal = llvm::popcount(MaskVal);
2341 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2342 WidthVal));
2343 return;
2344 }
2345 }
2346 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2347 SelectS_BFEFromShifts(N);
2348 return;
2349 }
2350 break;
2351 case ISD::SRA:
2352 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2353 SelectS_BFEFromShifts(N);
2354 return;
2355 }
2356 break;
2357
2358 case ISD::SIGN_EXTEND_INREG: {
2359 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2360 SDValue Src = N->getOperand(0);
2361 if (Src.getOpcode() != ISD::SRL)
2362 break;
2363
2364 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2365 if (!Amt)
2366 break;
2367
2368 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2369 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2370 Amt->getZExtValue(), Width));
2371 return;
2372 }
2373 }
2374
2375 SelectCode(N);
2376}
2377
2378bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2379 assert(N->getOpcode() == ISD::BRCOND);
2380 if (!N->hasOneUse())
2381 return false;
2382
2383 SDValue Cond = N->getOperand(1);
2384 if (Cond.getOpcode() == ISD::CopyToReg)
2385 Cond = Cond.getOperand(2);
2386
2387 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2388 return false;
2389
2390 MVT VT = Cond.getOperand(0).getSimpleValueType();
2391 if (VT == MVT::i32)
2392 return true;
2393
2394 if (VT == MVT::i64) {
2395 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2396
2397 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2398 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2399 }
2400
2401 return false;
2402}
2403
2404static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2405 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2406 // Special case for amdgcn.ballot:
2407 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2408 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2409 // =>
2410 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2411 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2412 // Cond becomes a i(WaveSize) full mask value.
2413 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2414 // it here for completeness, so in this case Negate is set true on return.
2415 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2416 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2417 isNullConstant(VCMP.getOperand(1))) {
2418
2419 auto Cond = VCMP.getOperand(0);
2420 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2421 Cond = Cond.getOperand(0);
2422
2423 if (isBoolSGPR(Cond)) {
2424 Negate = VCMP_CC == ISD::SETEQ;
2425 return Cond;
2426 }
2427 }
2428 return SDValue();
2429}
2430
2431void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2432 SDValue Cond = N->getOperand(1);
2433
2434 if (Cond.isUndef()) {
2435 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2436 N->getOperand(2), N->getOperand(0));
2437 return;
2438 }
2439
2440 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2441 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2442
2443 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2444 bool AndExec = !UseSCCBr;
2445 bool Negate = false;
2446
2447 if (Cond.getOpcode() == ISD::SETCC &&
2448 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2449 SDValue VCMP = Cond->getOperand(0);
2450 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2451 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2452 isNullConstant(Cond->getOperand(1)) &&
2453 // We may encounter ballot.i64 in wave32 mode on -O0.
2454 VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2455 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2456 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2457 // BRCOND i1 %C, %BB
2458 // =>
2459 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2460 // VCC = COPY i(WaveSize) %VCMP
2461 // S_CBRANCH_VCCNZ/VCCZ %BB
2462 Negate = CC == ISD::SETEQ;
2463 bool NegatedBallot = false;
2464 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2465 Cond = BallotCond;
2466 UseSCCBr = !BallotCond->isDivergent();
2467 Negate = Negate ^ NegatedBallot;
2468 } else {
2469 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2470 // selected as V_CMP, but this may change for uniform condition.
2471 Cond = VCMP;
2472 UseSCCBr = false;
2473 }
2474 }
2475 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2476 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2477 // used.
2478 AndExec = false;
2479 }
2480
2481 unsigned BrOp =
2482 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2483 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2484 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2485 SDLoc SL(N);
2486
2487 if (AndExec) {
2488 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2489 // analyzed what generates the vcc value, so we do not know whether vcc
2490 // bits for disabled lanes are 0. Thus we need to mask out bits for
2491 // disabled lanes.
2492 //
2493 // For the case that we select S_CBRANCH_SCC1 and it gets
2494 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2495 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2496 //
2497 // We could add an analysis of what generates the vcc value here and omit
2498 // the S_AND when it is unnecessary. But it would be better to add a separate
2499 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2500 // catches both cases.
2501 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2502 : AMDGPU::S_AND_B64,
2503 SL, MVT::i1,
2504 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2505 : AMDGPU::EXEC,
2506 MVT::i1),
2507 Cond),
2508 0);
2509 }
2510
2511 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2512 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2513 N->getOperand(2), // Basic Block
2514 VCC.getValue(0));
2515}
2516
2517void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2518 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2519 !N->isDivergent()) {
2520 SDValue Src = N->getOperand(0);
2521 if (Src.getValueType() == MVT::f16) {
2522 if (isExtractHiElt(Src, Src)) {
2523 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2524 {Src});
2525 return;
2526 }
2527 }
2528 }
2529
2530 SelectCode(N);
2531}
2532
2533void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2534 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2535 // be copied to an SGPR with readfirstlane.
2536 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2537 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2538
2539 SDValue Chain = N->getOperand(0);
2540 SDValue Ptr = N->getOperand(2);
2541 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2542 MachineMemOperand *MMO = M->getMemOperand();
2543 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2544
2545 SDValue Offset;
2546 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2547 SDValue PtrBase = Ptr.getOperand(0);
2548 SDValue PtrOffset = Ptr.getOperand(1);
2549
2550 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2551 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2552 N = glueCopyToM0(N, PtrBase);
2553 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2554 }
2555 }
2556
2557 if (!Offset) {
2558 N = glueCopyToM0(N, Ptr);
2559 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2560 }
2561
2562 SDValue Ops[] = {
2563 Offset,
2564 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2565 Chain,
2566 N->getOperand(N->getNumOperands() - 1) // New glue
2567 };
2568
2569 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2570 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2571}
2572
2573// We need to handle this here because tablegen doesn't support matching
2574// instructions with multiple outputs.
2575void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2576 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2577 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2578 N->getOperand(5), N->getOperand(0)};
2579
2580 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2581 MachineMemOperand *MMO = M->getMemOperand();
2582 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2583 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2584}
2585
2586static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2587 switch (IntrID) {
2588 case Intrinsic::amdgcn_ds_gws_init:
2589 return AMDGPU::DS_GWS_INIT;
2590 case Intrinsic::amdgcn_ds_gws_barrier:
2591 return AMDGPU::DS_GWS_BARRIER;
2592 case Intrinsic::amdgcn_ds_gws_sema_v:
2593 return AMDGPU::DS_GWS_SEMA_V;
2594 case Intrinsic::amdgcn_ds_gws_sema_br:
2595 return AMDGPU::DS_GWS_SEMA_BR;
2596 case Intrinsic::amdgcn_ds_gws_sema_p:
2597 return AMDGPU::DS_GWS_SEMA_P;
2598 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2599 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2600 default:
2601 llvm_unreachable("not a gws intrinsic");
2602 }
2603}
2604
2605void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2606 if (!Subtarget->hasGWS() ||
2607 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2608 !Subtarget->hasGWSSemaReleaseAll())) {
2609 // Let this error.
2610 SelectCode(N);
2611 return;
2612 }
2613
2614 // Chain, intrinsic ID, vsrc, offset
2615 const bool HasVSrc = N->getNumOperands() == 4;
2616 assert(HasVSrc || N->getNumOperands() == 3);
2617
2618 SDLoc SL(N);
2619 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2620 int ImmOffset = 0;
2621 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2622 MachineMemOperand *MMO = M->getMemOperand();
2623
2624 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
2625 // effect, so SIFixSGPRCopies will validly insert a readfirstlane.
2626
2627 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2628 // offset field) % 64. Some versions of the programming guide omit the m0
2629 // part, or claim it's from offset 0.
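 // e.g. (illustrative) with M0[21:16] = 3 and an offset field of 2, the
 // hardware uses resource id (base + 3 + 2) % 64.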
2630 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2631 // If we have a constant offset, try to use the 0 in m0 as the base.
2632 // TODO: Look into changing the default m0 initialization value. If the
2633 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2634 // the immediate offset.
2635 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2636 ImmOffset = ConstOffset->getZExtValue();
2637 } else {
2638 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2639 ImmOffset = BaseOffset.getConstantOperandVal(1);
2640 BaseOffset = BaseOffset.getOperand(0);
2641 }
2642
2643 // Prefer to do the shift in an SGPR since it should be possible to use m0
2644 // as the result directly. If it's already an SGPR, it will be eliminated
2645 // later.
2646 SDNode *SGPROffset
2647 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2648 BaseOffset);
2649 // Shift to offset in m0
2650 SDNode *M0Base
2651 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2652 SDValue(SGPROffset, 0),
2653 CurDAG->getTargetConstant(16, SL, MVT::i32));
2654 glueCopyToM0(N, SDValue(M0Base, 0));
2655 }
2656
2657 SDValue Chain = N->getOperand(0);
2658 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2659
2660 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2661 SmallVector<SDValue, 5> Ops;
2662 if (HasVSrc)
2663 Ops.push_back(N->getOperand(2));
2664 Ops.push_back(OffsetField);
2665 Ops.push_back(Chain);
2666
2667 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2668 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2669}
2670
2671void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2672 if (Subtarget->getLDSBankCount() != 16) {
2673 // This is a single instruction with a pattern.
2674 SelectCode(N);
2675 return;
2676 }
2677
2678 SDLoc DL(N);
2679
2680 // This requires 2 instructions. It is possible to write a pattern to support
2681 // this, but the generated isel emitter doesn't correctly deal with multiple
2682 // output instructions using the same physical register input. The copy to m0
2683 // is incorrectly placed before the second instruction.
2684 //
2685 // TODO: Match source modifiers.
2686 //
2687 // def : Pat <
2688 // (int_amdgcn_interp_p1_f16
2689 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2690 // (i32 timm:$attrchan), (i32 timm:$attr),
2691 // (i1 timm:$high), M0),
2692 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2693 // timm:$attrchan, 0,
2694 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2695 // let Predicates = [has16BankLDS];
2696 // }
2697
2698 // 16 bank LDS
2699 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2700 N->getOperand(5), SDValue());
2701
2702 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2703
2704 SDNode *InterpMov =
2705 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2706 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2707 N->getOperand(3), // Attr
2708 N->getOperand(2), // Attrchan
2709 ToM0.getValue(1) // In glue
2710 });
2711
2712 SDNode *InterpP1LV =
2713 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2714 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2715 N->getOperand(1), // Src0
2716 N->getOperand(3), // Attr
2717 N->getOperand(2), // Attrchan
2718 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2719 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2720 N->getOperand(4), // high
2721 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2722 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2723 SDValue(InterpMov, 1)
2724 });
2725
2726 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2727}
2728
2729void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2730 unsigned IntrID = N->getConstantOperandVal(1);
2731 switch (IntrID) {
2732 case Intrinsic::amdgcn_ds_append:
2733 case Intrinsic::amdgcn_ds_consume: {
2734 if (N->getValueType(0) != MVT::i32)
2735 break;
2736 SelectDSAppendConsume(N, IntrID);
2737 return;
2738 }
2739 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2740 SelectDSBvhStackIntrinsic(N);
2741 return;
2742 }
2743
2744 SelectCode(N);
2745}
2746
2747void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2748 unsigned IntrID = N->getConstantOperandVal(0);
2749 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2750 SDNode *ConvGlueNode = N->getGluedNode();
2751 if (ConvGlueNode) {
2752 // FIXME: Possibly iterate over multiple glue nodes?
2753 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2754 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2755 ConvGlueNode =
2756 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2757 MVT::Glue, SDValue(ConvGlueNode, 0));
2758 } else {
2759 ConvGlueNode = nullptr;
2760 }
2761 switch (IntrID) {
2762 case Intrinsic::amdgcn_wqm:
2763 Opcode = AMDGPU::WQM;
2764 break;
2765 case Intrinsic::amdgcn_softwqm:
2766 Opcode = AMDGPU::SOFT_WQM;
2767 break;
2768 case Intrinsic::amdgcn_wwm:
2769 case Intrinsic::amdgcn_strict_wwm:
2770 Opcode = AMDGPU::STRICT_WWM;
2771 break;
2772 case Intrinsic::amdgcn_strict_wqm:
2773 Opcode = AMDGPU::STRICT_WQM;
2774 break;
2775 case Intrinsic::amdgcn_interp_p1_f16:
2776 SelectInterpP1F16(N);
2777 return;
2778 case Intrinsic::amdgcn_inverse_ballot:
2779 switch (N->getOperand(1).getValueSizeInBits()) {
2780 case 32:
2781 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2782 break;
2783 case 64:
2784 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2785 break;
2786 default:
2787 llvm_unreachable("Unsupported size for inverse ballot mask.");
2788 }
2789 break;
2790 default:
2791 SelectCode(N);
2792 break;
2793 }
2794
2795 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2796 SDValue Src = N->getOperand(1);
2797 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2798 }
2799
2800 if (ConvGlueNode) {
2801 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
2802 NewOps.push_back(SDValue(ConvGlueNode, 0));
2803 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2804 }
2805}
2806
2807void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2808 unsigned IntrID = N->getConstantOperandVal(1);
2809 switch (IntrID) {
2810 case Intrinsic::amdgcn_ds_gws_init:
2811 case Intrinsic::amdgcn_ds_gws_barrier:
2812 case Intrinsic::amdgcn_ds_gws_sema_v:
2813 case Intrinsic::amdgcn_ds_gws_sema_br:
2814 case Intrinsic::amdgcn_ds_gws_sema_p:
2815 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2816 SelectDS_GWS(N, IntrID);
2817 return;
2818 default:
2819 break;
2820 }
2821
2822 SelectCode(N);
2823}
2824
2825void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2826 SDValue Log2WaveSize =
2827 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2828 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2829 {N->getOperand(0), Log2WaveSize});
2830}
2831
2832void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2833 SDValue SrcVal = N->getOperand(1);
2834 if (SrcVal.getValueType() != MVT::i32) {
2835 SelectCode(N); // Emit default error
2836 return;
2837 }
2838
2839 SDValue CopyVal;
2840 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2841 SDLoc SL(N);
2842
2843 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2844 CopyVal = SrcVal.getOperand(0);
2845 } else {
2846 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2847 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2848
2849 if (N->isDivergent()) {
2850 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2851 MVT::i32, SrcVal),
2852 0);
2853 }
2854
2855 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2856 {SrcVal, Log2WaveSize}),
2857 0);
2858 }
2859
2860 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2861 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2862}
2863
2864bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2865 unsigned &Mods,
2866 bool IsCanonicalizing,
2867 bool AllowAbs) const {
2868 Mods = SISrcMods::NONE;
2869 Src = In;
2870
2871 if (Src.getOpcode() == ISD::FNEG) {
2872 Mods |= SISrcMods::NEG;
2873 Src = Src.getOperand(0);
2874 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2875 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2876 // denormal mode, but we're implicitly canonicalizing in a source operand.
2877 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2878 if (LHS && LHS->isZero()) {
2879 Mods |= SISrcMods::NEG;
2880 Src = Src.getOperand(1);
2881 }
2882 }
2883
2884 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2885 Mods |= SISrcMods::ABS;
2886 Src = Src.getOperand(0);
2887 }
2888
2889 return true;
2890}
2891
2892bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2893 SDValue &SrcMods) const {
2894 unsigned Mods;
2895 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2896 /*AllowAbs=*/true)) {
2897 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2898 return true;
2899 }
2900
2901 return false;
2902}
2903
2904bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2905 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2906 unsigned Mods;
2907 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2908 /*AllowAbs=*/true)) {
2909 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2910 return true;
2911 }
2912
2913 return false;
2914}
2915
2916bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2917 SDValue &SrcMods) const {
2918 unsigned Mods;
2919 if (SelectVOP3ModsImpl(In, Src, Mods,
2920 /*IsCanonicalizing=*/true,
2921 /*AllowAbs=*/false)) {
2922 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2923 return true;
2924 }
2925
2926 return false;
2927}
2928
2929bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2930 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2931 return false;
2932
2933 Src = In;
2934 return true;
2935}
2936
2937bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2938 SDValue &SrcMods,
2939 bool OpSel) const {
2940 unsigned Mods;
2941 if (SelectVOP3ModsImpl(In, Src, Mods,
2942 /*IsCanonicalizing=*/true,
2943 /*AllowAbs=*/false)) {
2944 if (OpSel)
2945 Mods |= SISrcMods::OP_SEL_0;
2946 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2947 return true;
2948 }
2949
2950 return false;
2951}
2952
2953bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2954 SDValue &SrcMods) const {
2955 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2956}
2957
2958bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2959 SDValue &SrcMods) const {
2960 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2961}
2962
2963bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2964 SDValue &SrcMods, SDValue &Clamp,
2965 SDValue &Omod) const {
2966 SDLoc DL(In);
2967 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2968 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2969
2970 return SelectVOP3Mods(In, Src, SrcMods);
2971}
2972
2973bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2974 SDValue &SrcMods, SDValue &Clamp,
2975 SDValue &Omod) const {
2976 SDLoc DL(In);
2977 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2978 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2979
2980 return SelectVOP3BMods(In, Src, SrcMods);
2981}
2982
2983bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2984 SDValue &Clamp, SDValue &Omod) const {
2985 Src = In;
2986
2987 SDLoc DL(In);
2988 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2989 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2990
2991 return true;
2992}
2993
2994bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2995 SDValue &SrcMods, bool IsDOT) const {
2996 unsigned Mods = SISrcMods::NONE;
2997 Src = In;
2998
2999 // TODO: Handle G_FSUB 0 as fneg
3000 if (Src.getOpcode() == ISD::FNEG) {
3001 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3002 Src = Src.getOperand(0);
3003 }
3004
3005 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3006 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3007 unsigned VecMods = Mods;
3008
3009 SDValue Lo = stripBitcast(Src.getOperand(0));
3010 SDValue Hi = stripBitcast(Src.getOperand(1));
3011
3012 if (Lo.getOpcode() == ISD::FNEG) {
3013 Lo = stripBitcast(Lo.getOperand(0));
3014 Mods ^= SISrcMods::NEG;
3015 }
3016
3017 if (Hi.getOpcode() == ISD::FNEG) {
3018 Hi = stripBitcast(Hi.getOperand(0));
3019 Mods ^= SISrcMods::NEG_HI;
3020 }
3021
3022 if (isExtractHiElt(Lo, Lo))
3023 Mods |= SISrcMods::OP_SEL_0;
3024
3025 if (isExtractHiElt(Hi, Hi))
3026 Mods |= SISrcMods::OP_SEL_1;
3027
3028 unsigned VecSize = Src.getValueSizeInBits();
3029 Lo = stripExtractLoElt(Lo);
3030 Hi = stripExtractLoElt(Hi);
3031
3032 if (Lo.getValueSizeInBits() > VecSize) {
3033 Lo = CurDAG->getTargetExtractSubreg(
3034 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3035 MVT::getIntegerVT(VecSize), Lo);
3036 }
3037
3038 if (Hi.getValueSizeInBits() > VecSize) {
3039 Hi = CurDAG->getTargetExtractSubreg(
3040 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3041 MVT::getIntegerVT(VecSize), Hi);
3042 }
3043
3044 assert(Lo.getValueSizeInBits() <= VecSize &&
3045 Hi.getValueSizeInBits() <= VecSize);
3046
3047 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3048 // Really a scalar input. Just select from the low half of the register to
3049 // avoid packing.
3050
3051 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3052 Src = Lo;
3053 } else {
3054 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3055
3056 SDLoc SL(In);
3057 SDValue Undef = SDValue(
3058 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3059 Lo.getValueType()), 0);
3060 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3061 : AMDGPU::SReg_64RegClassID;
3062 const SDValue Ops[] = {
3063 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3064 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3065 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3066
3067 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3068 Src.getValueType(), Ops), 0);
3069 }
3070 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3071 return true;
3072 }
3073
3074 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3075 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3076 .bitcastToAPInt().getZExtValue();
3077 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3078 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3079 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3080 return true;
3081 }
3082 }
3083
3084 Mods = VecMods;
3085 }
3086
3087 // Packed instructions do not have abs modifiers.
3088 Mods |= SISrcMods::OP_SEL_1;
3089
3090 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3091 return true;
3092}
3093
3094bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3095 SDValue &SrcMods) const {
3096 return SelectVOP3PMods(In, Src, SrcMods, true);
3097}
3098
3099bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3100 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3101 // A literal i1 value set in the intrinsic represents SrcMods for the next operand.
3102 // 1 promotes packed values to signed, 0 treats them as unsigned.
3103 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3104
3105 unsigned Mods = SISrcMods::OP_SEL_1;
3106 unsigned SrcSign = C->getZExtValue();
3107 if (SrcSign == 1)
3108 Mods ^= SISrcMods::NEG;
3109
3110 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3111 return true;
3112}
3113
3114bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3115 SDValue &Src) const {
3116 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3117 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3118
3119 unsigned Mods = SISrcMods::OP_SEL_1;
3120 unsigned SrcVal = C->getZExtValue();
3121 if (SrcVal == 1)
3122 Mods |= SISrcMods::OP_SEL_0;
3123
3124 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3125 return true;
3126}
3127
3128static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3129 llvm::SelectionDAG *CurDAG,
3130 const SDLoc &DL) {
3131 unsigned DstRegClass;
3132 EVT DstTy;
3133 switch (Elts.size()) {
3134 case 8:
3135 DstRegClass = AMDGPU::VReg_256RegClassID;
3136 DstTy = MVT::v8i32;
3137 break;
3138 case 4:
3139 DstRegClass = AMDGPU::VReg_128RegClassID;
3140 DstTy = MVT::v4i32;
3141 break;
3142 case 2:
3143 DstRegClass = AMDGPU::VReg_64RegClassID;
3144 DstTy = MVT::v2i32;
3145 break;
3146 default:
3147 llvm_unreachable("unhandled Reg sequence size");
3148 }
3149
3150 SmallVector<SDValue, 8 + 1> Ops;
3151 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3152 for (unsigned i = 0; i < Elts.size(); ++i) {
3153 Ops.push_back(Elts[i]);
3154 Ops.push_back(CurDAG->getTargetConstant(
3155 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3156 }
3157 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3158}
3159
3160static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3161 llvm::SelectionDAG *CurDAG,
3162 const SDLoc &DL) {
3163 SmallVector<SDValue, 8> PackedElts;
3164 assert("unhandled Reg sequence size" &&
3165 (Elts.size() == 8 || Elts.size() == 16));
3166
3167 // Pack 16-bit elements in pairs into a 32-bit register. If both elements
3168 // are unpacked from the same 32-bit source, use it; otherwise pack them with v_perm.
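 // The 0x05040100 selector below picks bytes {1,0} of src1 for the low half
 // and bytes {1,0} of src0 (codes 5,4) for the high half, i.e. it packs the
 // low 16 bits of both inputs into one dword.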
3169 for (unsigned i = 0; i < Elts.size(); i += 2) {
3170 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3171 SDValue HiSrc;
3172 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3173 PackedElts.push_back(HiSrc);
3174 } else {
3175 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3176 MachineSDNode *Packed =
3177 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3178 {Elts[i + 1], Elts[i], PackLoLo});
3179 PackedElts.push_back(SDValue(Packed, 0));
3180 }
3181 }
3182
3183 return buildRegSequence32(PackedElts, CurDAG, DL);
3184}
3185
3186static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3187 llvm::SelectionDAG *CurDAG,
3188 const SDLoc &DL, unsigned ElementSize) {
3189 if (ElementSize == 16)
3190 return buildRegSequence16(Elts, CurDAG, DL);
3191 if (ElementSize == 32)
3192 return buildRegSequence32(Elts, CurDAG, DL);
3193 llvm_unreachable("Unhandled element size");
3194}
3195
3196static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3197 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3198 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3199 unsigned ElementSize) {
3200 if (ModOpcode == ISD::FNEG) {
3201 Mods |= SISrcMods::NEG;
3202 // Check if all elements also have abs modifier
3203 SmallVector<SDValue, 8> NegAbsElts;
3204 for (auto El : Elts) {
3205 if (El.getOpcode() != ISD::FABS)
3206 break;
3207 NegAbsElts.push_back(El->getOperand(0));
3208 }
3209 if (Elts.size() != NegAbsElts.size()) {
3210 // Neg
3211 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3212 } else {
3213 // Neg and Abs
3214 Mods |= SISrcMods::NEG_HI;
3215 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3216 }
3217 } else {
3218 assert(ModOpcode == ISD::FABS);
3219 // Abs
3220 Mods |= SISrcMods::NEG_HI;
3221 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3222 }
3223}
3224
3225// Check all f16 elements for modifiers while looking through b32 and v2b16
3226// build vectors; stop if an element does not satisfy ModifierCheck.
3227static void
3228checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3229 std::function<bool(SDValue)> ModifierCheck) {
3230 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3231 if (auto *F16Pair =
3232 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3233 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3234 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3235 if (!ModifierCheck(ElF16))
3236 break;
3237 }
3238 }
3239 }
3240}
3241
3242bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3243 SDValue &SrcMods) const {
3244 Src = In;
3245 unsigned Mods = SISrcMods::OP_SEL_1;
3246
3247 // mods are on f16 elements
3248 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3249 SmallVector<SDValue, 8> EltsF16;
3250
3251 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3252 if (Element.getOpcode() != ISD::FNEG)
3253 return false;
3254 EltsF16.push_back(Element.getOperand(0));
3255 return true;
3256 });
3257
3258 // All elements have neg modifier
3259 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3260 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3261 Mods |= SISrcMods::NEG;
3262 Mods |= SISrcMods::NEG_HI;
3263 }
3264 }
3265
3266 // mods are on v2f16 elements
3267 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3268 SmallVector<SDValue, 8> EltsV2F16;
3269 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3270 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3271 // Based on first element decide which mod we match, neg or abs
3272 if (ElV2f16.getOpcode() != ISD::FNEG)
3273 break;
3274 EltsV2F16.push_back(ElV2f16.getOperand(0));
3275 }
3276
3277 // All pairs of elements have neg modifier
3278 if (BV->getNumOperands() == EltsV2F16.size()) {
3279 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3280 Mods |= SISrcMods::NEG;
3281 Mods |= SISrcMods::NEG_HI;
3282 }
3283 }
3284
3285 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3286 return true;
3287}
3288
3289bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3290 SDValue &SrcMods) const {
3291 Src = In;
3292 unsigned Mods = SISrcMods::OP_SEL_1;
3293 unsigned ModOpcode;
3294
3295 // mods are on f16 elements
3296 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3297 SmallVector<SDValue, 8> EltsF16;
3298 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3299 // Based on first element decide which mod we match, neg or abs
3300 if (EltsF16.empty())
3301 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3302 if (ElF16.getOpcode() != ModOpcode)
3303 return false;
3304 EltsF16.push_back(ElF16.getOperand(0));
3305 return true;
3306 });
3307
3308 // All elements have ModOpcode modifier
3309 if (BV->getNumOperands() * 2 == EltsF16.size())
3310 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3311 16);
3312 }
3313
3314 // mods are on v2f16 elements
3315 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3316 SmallVector<SDValue, 8> EltsV2F16;
3317
3318 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3319 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3320 // Based on first element decide which mod we match, neg or abs
3321 if (EltsV2F16.empty())
3322 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3323 if (ElV2f16->getOpcode() != ModOpcode)
3324 break;
3325 EltsV2F16.push_back(ElV2f16->getOperand(0));
3326 }
3327
3328 // All elements have ModOpcode modifier
3329 if (BV->getNumOperands() == EltsV2F16.size())
3330 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3331 32);
3332 }
3333
3334 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3335 return true;
3336}
3337
3338bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3339 SDValue &SrcMods) const {
3340 Src = In;
3341 unsigned Mods = SISrcMods::OP_SEL_1;
3342 SmallVector<SDValue, 8> EltsF32;
3343
3344 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3345 assert(BV->getNumOperands() > 0);
3346 // Based on first element decide which mod we match, neg or abs
3347 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3348 unsigned ModOpcode =
3349 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3350 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3351 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3352 if (ElF32.getOpcode() != ModOpcode)
3353 break;
3354 EltsF32.push_back(ElF32.getOperand(0));
3355 }
3356
3357 // All elements had ModOpcode modifier
3358 if (BV->getNumOperands() == EltsF32.size())
3359 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3360 32);
3361 }
3362
3363 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3364 return true;
3365}
3366
3367bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3368 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3369 BitVector UndefElements;
3370 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3371 if (isInlineImmediate(Splat.getNode())) {
3372 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3373 unsigned Imm = C->getAPIntValue().getSExtValue();
3374 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3375 return true;
3376 }
3377 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3378 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3379 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3380 return true;
3381 }
3382 llvm_unreachable("unhandled Constant node");
3383 }
3384 }
3385
3386 // 16-bit splat
3387 SDValue SplatSrc32 = stripBitcast(In);
3388 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3389 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3390 SDValue SplatSrc16 = stripBitcast(Splat32);
3391 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3392 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3393 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3394 std::optional<APInt> RawValue;
3395 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3396 RawValue = C->getValueAPF().bitcastToAPInt();
3397 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3398 RawValue = C->getAPIntValue();
3399
3400 if (RawValue.has_value()) {
3401 EVT VT = In.getValueType().getScalarType();
3402 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3403 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3404 ? APFloat::IEEEhalf()
3405 : APFloat::BFloat(),
3406 RawValue.value());
3407 if (TII->isInlineConstant(FloatVal)) {
3408 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3409 MVT::i16);
3410 return true;
3411 }
3412 } else if (VT.getSimpleVT() == MVT::i16) {
3413 if (TII->isInlineConstant(RawValue.value())) {
3414 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3415 MVT::i16);
3416 return true;
3417 }
3418 } else
3419 llvm_unreachable("unknown 16-bit type");
3420 }
3421 }
3422 }
3423
3424 return false;
3425}
3426
3427bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3428 SDValue &IndexKey) const {
3429 unsigned Key = 0;
3430 Src = In;
3431
3432 if (In.getOpcode() == ISD::SRL) {
3433 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3434 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3435 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3436 ShiftAmt->getZExtValue() % 8 == 0) {
3437 Key = ShiftAmt->getZExtValue() / 8;
3438 Src = ShiftSrc;
3439 }
3440 }
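 // e.g. (srl x, 16) yields Key = 2: the index starts at byte 2 of x.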
3441
3442 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3443 return true;
3444}
3445
3446bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3447 SDValue &IndexKey) const {
3448 unsigned Key = 0;
3449 Src = In;
3450
3451 if (In.getOpcode() == ISD::SRL) {
3452 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3453 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3454 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3455 ShiftAmt->getZExtValue() == 16) {
3456 Key = 1;
3457 Src = ShiftSrc;
3458 }
3459 }
3460
3461 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3462 return true;
3463}
3464
3465bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3466 SDValue &SrcMods) const {
3467 Src = In;
3468 // FIXME: Handle op_sel
3469 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3470 return true;
3471}
3472
3473bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3474 SDValue &SrcMods) const {
3475 // FIXME: Handle op_sel
3476 return SelectVOP3Mods(In, Src, SrcMods);
3477}
3478
3479// The return value is not whether the match is possible (which it always is),
3480// but whether or not a conversion is really used.
3481bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3482 unsigned &Mods) const {
3483 Mods = 0;
3484 SelectVOP3ModsImpl(In, Src, Mods);
3485
3486 if (Src.getOpcode() == ISD::FP_EXTEND) {
3487 Src = Src.getOperand(0);
3488 assert(Src.getValueType() == MVT::f16);
3489 Src = stripBitcast(Src);
3490
3491 // Be careful about folding modifiers if we already have an abs. fneg is
3492 // applied last, so we don't want to apply an earlier fneg.
3493 if ((Mods & SISrcMods::ABS) == 0) {
3494 unsigned ModsTmp;
3495 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3496
3497 if ((ModsTmp & SISrcMods::NEG) != 0)
3498 Mods ^= SISrcMods::NEG;
3499
3500 if ((ModsTmp & SISrcMods::ABS) != 0)
3501 Mods |= SISrcMods::ABS;
3502 }
3503
3504 // op_sel/op_sel_hi decide the source type and source.
3505 // If the source's op_sel_hi is set, it indicates to do a conversion from
3506 // fp16. If the source's op_sel is set, it picks the high half of the
3507 // source register.
3508
3509 Mods |= SISrcMods::OP_SEL_1;
3510 if (isExtractHiElt(Src, Src)) {
3511 Mods |= SISrcMods::OP_SEL_0;
3512
3513 // TODO: Should we try to look for neg/abs here?
3514 }
3515
3516 return true;
3517 }
3518
3519 return false;
3520}
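// Sketch of the modifier folding above (the value %x is hypothetical): for
// In = (fneg (fp_extend (fneg %x))), the first SelectVOP3ModsImpl call sets
// SISrcMods::NEG for the outer fneg; looking through the fp_extend finds
// the inner fneg, and Mods ^= SISrcMods::NEG cancels the pair, because two
// negations compose to the identity. OP_SEL_1 then marks the f16 source,
// and OP_SEL_0 is added when the source is the high half of a 32-bit
// register.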
3521
3522bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3523 SDValue &SrcMods) const {
3524 unsigned Mods = 0;
3525 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3526 return false;
3527 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3528 return true;
3529}
3530
3531bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3532 SDValue &SrcMods) const {
3533 unsigned Mods = 0;
3534 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3535 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3536 return true;
3537}
3538
3539SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3540 if (In.isUndef())
3541 return CurDAG->getUNDEF(MVT::i32);
3542
3543 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3544 SDLoc SL(In);
3545 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3546 }
3547
3548 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3549 SDLoc SL(In);
3550 return CurDAG->getConstant(
3551 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3552 }
3553
3554 SDValue Src;
3555 if (isExtractHiElt(In, Src))
3556 return Src;
3557
3558 return SDValue();
3559}
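// Worked example (constant chosen for illustration): for a ConstantFPSDNode
// holding the f16 value 1.0 (bit pattern 0x3C00), getHi16Elt returns the
// i32 constant 0x3C000000, i.e. the bits shifted into the high half. A
// non-constant input only succeeds when isExtractHiElt proves it is already
// the high 16 bits of a 32-bit value.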
3560
3561 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3562 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3563
3564 const SIRegisterInfo *SIRI =
3565 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3566 const SIInstrInfo * SII =
3567 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3568
3569 unsigned Limit = 0;
3570 bool AllUsesAcceptSReg = true;
3571 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3572 Limit < 10 && U != E; ++U, ++Limit) {
3573 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3574
3575 // If the register class is unknown, the operand could be one that must
3576 // be an SGPR, e.g. an inline asm constraint, so conservatively treat
3577 // this as not a VGPR immediate.
3578 if (!RC || SIRI->isSGPRClass(RC))
3579 return false;
3580
3581 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3582 AllUsesAcceptSReg = false;
3583 SDNode * User = *U;
3584 if (User->isMachineOpcode()) {
3585 unsigned Opc = User->getMachineOpcode();
3586 const MCInstrDesc &Desc = SII->get(Opc);
3587 if (Desc.isCommutable()) {
3588 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3589 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3590 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3591 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3592 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3593 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3594 CommutedRC == &AMDGPU::VS_64RegClass)
3595 AllUsesAcceptSReg = true;
3596 }
3597 }
3598 }
3599 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
3600 // commuting current user. This means have at least one use
3601 // that strictly require VGPR. Thus, we will not attempt to commute
3602 // other user instructions.
3603 if (!AllUsesAcceptSReg)
3604 break;
3605 }
3606 }
3607 return !AllUsesAcceptSReg && (Limit < 10);
3608}
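// Note on the result above: the immediate goes in a VGPR only when at least
// one of the first ten uses scanned strictly requires a VGPR even after
// trying to commute it into a VS (VGPR-or-SGPR) operand, and the scan did
// not hit the ten-use limit; otherwise the function returns false.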
3609
3610bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3611 auto Ld = cast<LoadSDNode>(N);
3612
3613 const MachineMemOperand *MMO = Ld->getMemOperand();
3614 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3615 return false;
3616
3617 return MMO->getSize().hasValue() &&
3618 Ld->getAlign() >=
3619 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3620 uint64_t(4))) &&
3621 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3622 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3623 (Subtarget->getScalarizeGlobalBehavior() &&
3624 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3625 Ld->isSimple() &&
3626 static_cast<const SITargetLowering *>(getTargetLowering())
3627 ->isMemOpHasNoClobberedMemOperand(N)));
3628}
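// Summary of the predicate above: the load is treated as uniform when it is
// not divergent (or its memory operand is known to be uniform), its size is
// known, its alignment is at least min(size, 4) bytes, and it either
// targets the constant or 32-bit-constant address space, or, when
// scalarizing global loads is enabled, is a simple global-address load
// whose memory operand is provably not clobbered.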
3629
3630 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3631 const AMDGPUTargetLowering& Lowering =
3632 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3633 bool IsModified = false;
3634 do {
3635 IsModified = false;
3636
3637 // Go over all selected nodes and try to fold them a bit more
3638 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3639 while (Position != CurDAG->allnodes_end()) {
3640 SDNode *Node = &*Position++;
3641 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3642 if (!MachineNode)
3643 continue;
3644
3645 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3646 if (ResNode != Node) {
3647 if (ResNode)
3648 ReplaceUses(Node, ResNode);
3649 IsModified = true;
3650 }
3651 }
3652 CurDAG->RemoveDeadNodes();
3653 } while (IsModified);
3654}
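// The loop above applies PostISelFolding to a fixed point: each pass walks
// every machine node, replaces any node the target lowering managed to fold,
// deletes the dead originals, and repeats while at least one node changed.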
3655
3656 AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3657 CodeGenOptLevel OptLevel)
3658 : SelectionDAGISelLegacy(
3659 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3660