//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
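// Two equivalent forms can reach here after legalization, e.g.:
//   (i16 (extract_vector_elt v2i16:x, 1))
//   (i16 (trunc (srl i32:x, 16)))
// both of which denote the high half of a 32-bit register.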
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure what is ultimately just a use of the
// low 16 bits of the same register.
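// e.g. (i16 (extract_vector_elt v2i16:x, 0)) --> x, and
// (i16 (trunc y)) --> y stripped of bitcasts when y is 32 bits wide.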
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

102 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
103 false)
105INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
107#ifdef EXPENSIVE_CHECKS
110#endif
112 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
113 false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

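// Returns true when an f16 result of the given opcode is known to write
// zeros to the unused high 16 bits of its 32-bit destination register, so a
// later pack of the low half can skip an explicit mask. The gfx9/gfx10 split
// below reflects that pre-gfx10 parts zero the unused half while gfx10+
// preserves it.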
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return !Subtarget->hasMadMixInsts();
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return !Subtarget->hasMadMixInsts();
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
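  //
  // A d16_hi load writes only the high 16 bits of a 32-bit register and
  // leaves the low half intact, so the existing low element is tied in as
  // the pass-through value instead of being re-packed afterwards.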

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

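// Returns true if N fits in an inline operand slot: AMDGPU encodes small
// integers (e.g. -16..64) and a few FP values (e.g. 0.5, 1.0, 2.0) directly
// in the instruction instead of consuming a 32-bit literal.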
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering &Lowering =
      *static_cast<const SITargetLowering *>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}
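// DS instructions implicitly read M0 on targets where it must be
// initialized: for LDS it is primed with -1 (an all-ones address bound,
// i.e. no limit), while for the region (GDS) address space it holds the GDS
// size, as the two cases below set up.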
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
      glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
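// REG_SEQUENCE operands are the destination register class ID followed by
// (value, subregister index) pairs, e.g. roughly:
//   REG_SEQUENCE SReg_64RegClassID, %lo, sub0, %hi, sub1
// SelectBuildVector assembles exactly that operand list, one pair per vector
// element.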
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has separate operands for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

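// A shift instruction only reads the low bits of its shift amount, so an
// explicit (and amt, mask) is unneeded when the mask keeps at least
// ShAmtBits low bits intact, either directly or after accounting for
// known-zero bits in the amount.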
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's a complicated pattern to match,
    // i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(GCNTargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
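// A 64-bit add/sub is decomposed into 32-bit halves with the carry threaded
// from the low half into the high half, e.g. roughly:
//   v_add_co_u32 vLo, vcc, aLo, bLo
//   v_addc_u32   vHi, vcc, aHi, bHi, vcc
// and the halves are recombined with a REG_SEQUENCE.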
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

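  // Indexed as OpcMap[HasCarryIn][IsDivergent][IsAdd]: plain vs.
  // carry-consuming opcode, scalar (SALU) vs. vector (VALU) form, and sub
  // vs. add.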
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
  // unsigned carry out despite the _i32 name. These were renamed in VI to
  // _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

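  // The scalar pseudos below assume the carry output feeds a matching carry
  // op; if any other user consumes the carry, fall back to the VALU form.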
  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc =
      (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

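  // A multiply-add with a zero addend produces the full 64-bit product in a
  // register pair; the low and high halves are then split out below with
  // subregister extracts.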
  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}
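// DS instructions address LDS with a single VGPR base plus a 16-bit unsigned
// immediate offset, e.g. roughly:
//   ds_read_b32 v0, v1 offset:16
// so folding a constant into the offset field saves a VALU add. The helpers
// below pick the base/offset split.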
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of flat scratch load/store in the form of `base
// + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

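// ds_read2/ds_write2 encode two 8-bit offsets counted in elements of Size
// bytes, e.g. roughly:
//   ds_read2_b32 v[0:1], v2 offset0:0 offset1:1
// which is why the byte offsets below are divided by Size before being
// encoded.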
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
  return true;
}
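// A MUBUF access computes its address roughly as
//   base (from SRsrc) + VAddr + SOffset + immediate offset
// with the offen/idxen/addr64 bits selecting how VAddr is interpreted. The
// helper below decides which component each part of the pointer lands in.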
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  auto RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode *findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field
        // and add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base
          // address is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
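// On subtargets with the saddr form this corresponds to something like:
//   global_load_dword v0, v1, s[2:3] offset:16
// with a 64-bit SGPR base, a zero-extended 32-bit VGPR offset, and a signed
// immediate offset.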
1764bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1765 SDValue Addr,
1766 SDValue &SAddr,
1767 SDValue &VOffset,
1768 SDValue &Offset) const {
1769 int64_t ImmOffset = 0;
1770
1771 // Match the immediate offset first, which canonically is moved as low as
1772 // possible.
1773
1774 SDValue LHS, RHS;
1775 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1776 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1777 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1778
1779 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1780 SIInstrFlags::FlatGlobal)) {
1781 Addr = LHS;
1782 ImmOffset = COffsetVal;
1783 } else if (!LHS->isDivergent()) {
1784 if (COffsetVal > 0) {
1785 SDLoc SL(N);
1786 // saddr + large_offset -> saddr +
1787 // (voffset = large_offset & ~MaxOffset) +
1788 // (large_offset & MaxOffset);
1789 int64_t SplitImmOffset, RemainderOffset;
1790 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1791 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1792
1793 if (isUInt<32>(RemainderOffset)) {
1794 SDNode *VMov = CurDAG->getMachineNode(
1795 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1796 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1797 VOffset = SDValue(VMov, 0);
1798 SAddr = LHS;
1799 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1800 return true;
1801 }
1802 }
1803
1804 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
1805 // is 1, we would need 1 or 2 extra moves for each half of the constant,
1806 // so it is better to do a scalar add and then issue a single VALU
1807 // instruction to materialize zero. Otherwise it takes fewer instructions
1808 // to perform VALU adds with immediates or inline literals.
1809 unsigned NumLiterals =
1810 !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1811 !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1812 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1813 return false;
1814 }
1815 }
1816
1817 // Match the variable offset.
1818 if (Addr.getOpcode() == ISD::ADD) {
1819 LHS = Addr.getOperand(0);
1820 RHS = Addr.getOperand(1);
1821
1822 if (!LHS->isDivergent()) {
1823 // add (i64 sgpr), (zero_extend (i32 vgpr))
1824 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1825 SAddr = LHS;
1826 VOffset = ZextRHS;
1827 }
1828 }
1829
1830 if (!SAddr && !RHS->isDivergent()) {
1831 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1832 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1833 SAddr = RHS;
1834 VOffset = ZextLHS;
1835 }
1836 }
1837
1838 if (SAddr) {
1839 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1840 return true;
1841 }
1842 }
1843
1844 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1845 isa<ConstantSDNode>(Addr))
1846 return false;
1847
1848 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1849 // moves required to copy a 64-bit SGPR to VGPR.
1850 SAddr = Addr;
1851 SDNode *VMov =
1852 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1853 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1854 VOffset = SDValue(VMov, 0);
1855 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1856 return true;
1857}
1858
1859static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1860 if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1861 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1862 } else if (SAddr.getOpcode() == ISD::ADD &&
1863 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1864 // Materialize this into a scalar move for the scalar address to avoid
1865 // a readfirstlane.
1866 auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1867 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1868 FI->getValueType(0));
1869 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1870 MVT::i32, TFI, SAddr.getOperand(1)),
1871 0);
1872 }
1873
1874 return SAddr;
1875}
1876
1877// Match (32-bit SGPR base) + sext(imm offset)
1878bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1879 SDValue &SAddr,
1880 SDValue &Offset) const {
1881 if (Addr->isDivergent())
1882 return false;
1883
1884 SDLoc DL(Addr);
1885
1886 int64_t COffsetVal = 0;
1887
1888 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1889 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1890 SAddr = Addr.getOperand(0);
1891 } else {
1892 SAddr = Addr;
1893 }
1894
1895 SAddr = SelectSAddrFI(CurDAG, SAddr);
1896
1897 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1898
1899 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1900 SIInstrFlags::FlatScratch)) {
1901 int64_t SplitImmOffset, RemainderOffset;
1902 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1903 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1904
1905 COffsetVal = SplitImmOffset;
1906
1907 SDValue AddOffset =
1908 SAddr.getOpcode() == ISD::TargetFrameIndex
1909 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1910 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1911 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1912 SAddr, AddOffset),
1913 0);
1914 }
1915
1916 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1917
1918 return true;
1919}
1920
1921// Check whether the flat scratch SVS swizzle bug affects this access.
1922bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1923 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1924 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1925 return false;
1926
1927 // The bug affects the swizzling of SVS accesses if there is any carry out
1928 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1929 // voffset to (soffset + inst_offset).
1930 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1931 KnownBits SKnown =
1932 KnownBits::add(CurDAG->computeKnownBits(SAddr),
1933 KnownBits::makeConstant(APInt(32, ImmOffset)));
1934 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1935 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1936 return (VMax & 3) + (SMax & 3) >= 4;
1937}
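// Worked example of the check above (invented values): if VAddr can end in
// 0b11 (VMax & 3 == 3) and SAddr + ImmOffset can end in 0b01 (SMax & 3 == 1),
// then 3 + 1 >= 4, so a carry from bit 1 into bit 2 is possible and the SVS
// form must be rejected; a sum below 4 proves no such carry can occur.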
1938
1939bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1940 SDValue &VAddr, SDValue &SAddr,
1941 SDValue &Offset) const {
1942 int64_t ImmOffset = 0;
1943
1944 SDValue LHS, RHS;
1945 SDValue OrigAddr = Addr;
1946 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1947 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1948 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1949
1950 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1951 Addr = LHS;
1952 ImmOffset = COffsetVal;
1953 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1954 SDLoc SL(N);
1955 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1956 // (large_offset & MaxOffset);
1957 int64_t SplitImmOffset, RemainderOffset;
1958 std::tie(SplitImmOffset, RemainderOffset)
1959 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1960
1961 if (isUInt<32>(RemainderOffset)) {
1962 SDNode *VMov = CurDAG->getMachineNode(
1963 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1964 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1965 VAddr = SDValue(VMov, 0);
1966 SAddr = LHS;
1967 if (!isFlatScratchBaseLegal(Addr))
1968 return false;
1969 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1970 return false;
1971 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1972 return true;
1973 }
1974 }
1975 }
1976
1977 if (Addr.getOpcode() != ISD::ADD)
1978 return false;
1979
1980 LHS = Addr.getOperand(0);
1981 RHS = Addr.getOperand(1);
1982
1983 if (!LHS->isDivergent() && RHS->isDivergent()) {
1984 SAddr = LHS;
1985 VAddr = RHS;
1986 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1987 SAddr = RHS;
1988 VAddr = LHS;
1989 } else {
1990 return false;
1991 }
1992
1993 if (OrigAddr != Addr) {
1994 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1995 return false;
1996 } else {
1997 if (!isFlatScratchBaseLegalSV(OrigAddr))
1998 return false;
1999 }
2000
2001 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2002 return false;
2003 SAddr = SelectSAddrFI(CurDAG, SAddr);
2004 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
2005 return true;
2006}
2007
2008// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2009// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2010// Handle the case where the Immediate Offset + SOffset is negative.
2011bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2012 bool Imm32Only,
2013 bool IsBuffer,
2014 int64_t ImmOffset) const {
2015 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2016 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2017 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2018 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2019 return false;
2020 }
2021
2022 return true;
2023}
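// E.g. (invented values) ImmOffset = -8 with SOffset known to be at least 4
// gives -8 + 4 < 0, so the pair is rejected; if SOffset is instead known to
// be at least 16, the sum is provably non-negative and the fold is allowed.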
2024
2025// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2026// not null) offset. If Imm32Only is true, match only 32-bit immediate
2027// offsets available on CI.
2028bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2029 SDValue *SOffset, SDValue *Offset,
2030 bool Imm32Only, bool IsBuffer,
2031 bool HasSOffset,
2032 int64_t ImmOffset) const {
2033 assert((!SOffset || !Offset) &&
2034 "Cannot match both soffset and offset at the same time!");
2035
2036 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2037 if (!C) {
2038 if (!SOffset)
2039 return false;
2040
2041 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2042 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2043 *SOffset = ByteOffsetNode;
2044 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2045 ImmOffset);
2046 }
2047 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2048 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2049 *SOffset = ByteOffsetNode.getOperand(0);
2050 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2051 ImmOffset);
2052 }
2053 }
2054 return false;
2055 }
2056
2057 SDLoc SL(ByteOffsetNode);
2058
2059 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2060 // offset for S_BUFFER instructions is unsigned.
2061 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2062 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2063 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2064 if (EncodedOffset && Offset && !Imm32Only) {
2065 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2066 return true;
2067 }
2068
2069 // SGPR and literal offsets are unsigned.
2070 if (ByteOffset < 0)
2071 return false;
2072
2073 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2074 if (EncodedOffset && Offset && Imm32Only) {
2075 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2076 return true;
2077 }
2078
2079 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2080 return false;
2081
2082 if (SOffset) {
2083 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2084 *SOffset = SDValue(
2085 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2086 return true;
2087 }
2088
2089 return false;
2090}
2091
2092SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2093 if (Addr.getValueType() != MVT::i32)
2094 return Addr;
2095
2096 // Zero-extend a 32-bit address.
2097 SDLoc SL(Addr);
2098
2099 const MachineFunction &MF = CurDAG->getMachineFunction();
2100 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2101 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2102 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2103
2104 const SDValue Ops[] = {
2105 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2106 Addr,
2107 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2108 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2109 0),
2110 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2111 };
2112
2113 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2114 Ops), 0);
2115}
2116
2117// Match a base and an immediate (if Offset is not null) or an SGPR (if
2118// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2119// true, match only 32-bit immediate offsets available on CI.
2120bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2121 SDValue *SOffset, SDValue *Offset,
2122 bool Imm32Only, bool IsBuffer,
2123 bool HasSOffset,
2124 int64_t ImmOffset) const {
2125 if (SOffset && Offset) {
2126 assert(!Imm32Only && !IsBuffer);
2127 SDValue B;
2128
2129 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2130 return false;
2131
2132 int64_t ImmOff = 0;
2133 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2134 ImmOff = C->getSExtValue();
2135
2136 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2137 ImmOff);
2138 }
2139
2140 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2141 // wraparound, because s_load instructions perform the addition in 64 bits.
2142 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2143 !Addr->getFlags().hasNoUnsignedWrap())
2144 return false;
2145
2146 SDValue N0, N1;
2147 // Extract the base and offset if possible.
2148 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2149 N0 = Addr.getOperand(0);
2150 N1 = Addr.getOperand(1);
2151 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2152 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2153 }
2154 if (!N0 || !N1)
2155 return false;
2156
2157 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2158 ImmOffset)) {
2159 SBase = N0;
2160 return true;
2161 }
2162 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2163 ImmOffset)) {
2164 SBase = N1;
2165 return true;
2166 }
2167 return false;
2168}
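// E.g. (sketch) for Addr = (add (add %sbase, %soff), 16), the first
// recursive call above peels the immediate (Offset = 16), and the second
// matches %soff as SOffset with SBase = %sbase, yielding the combined
// sgpr + immediate addressing form.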
2169
2170bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2171 SDValue *SOffset, SDValue *Offset,
2172 bool Imm32Only) const {
2173 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2174 SBase = Expand32BitAddress(SBase);
2175 return true;
2176 }
2177
2178 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2179 SBase = Expand32BitAddress(Addr);
2180 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2181 return true;
2182 }
2183
2184 return false;
2185}
2186
2187bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2188 SDValue &Offset) const {
2189 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2190}
2191
2192bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2193 SDValue &Offset) const {
2194 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2195 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2196 /* Imm32Only */ true);
2197}
2198
2199bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2200 SDValue &SOffset) const {
2201 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2202}
2203
2204bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2205 SDValue &SOffset,
2206 SDValue &Offset) const {
2207 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2208}
2209
2210bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2211 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2212 /* Imm32Only */ false, /* IsBuffer */ true);
2213}
2214
2215bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2216 SDValue &Offset) const {
2217 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2218 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2219 /* Imm32Only */ true, /* IsBuffer */ true);
2220}
2221
2222bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2223 SDValue &Offset) const {
2224 // Match the (soffset + offset) pair as a 32-bit register base and
2225 // an immediate offset.
2226 return N.getValueType() == MVT::i32 &&
2227 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2228 &Offset, /* Imm32Only */ false,
2229 /* IsBuffer */ true);
2230}
2231
2232bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2233 SDValue &Base,
2234 SDValue &Offset) const {
2235 SDLoc DL(Index);
2236
2237 if (CurDAG->isBaseWithConstantOffset(Index)) {
2238 SDValue N0 = Index.getOperand(0);
2239 SDValue N1 = Index.getOperand(1);
2240 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2241
2242 // (add n0, c0)
2243 // Don't peel off the offset (c0) if doing so could possibly lead
2244 // the base (n0) to be negative.
2245 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2246 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2247 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2248 Base = N0;
2249 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2250 return true;
2251 }
2252 }
2253
2254 if (isa<ConstantSDNode>(Index))
2255 return false;
2256
2257 Base = Index;
2258 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2259 return true;
2260}
2261
2262SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2263 SDValue Val, uint32_t Offset,
2264 uint32_t Width) {
2265 if (Val->isDivergent()) {
2266 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2267 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2268 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2269
2270 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2271 }
2272 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2273 // Pack the offset and width of the BFE into the format expected by
2274 // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0]
2275 // contain the offset and bits [22:16] the width.
2276 uint32_t PackedVal = Offset | (Width << 16);
2277 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2278
2279 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2280}
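// E.g. extracting bits [15:8] (offset 8, width 8) packs, per the layout
// described above, as:
//   PackedVal = 8 | (8 << 16) == 0x00080008
// so "S_BFE_U32 dst, src, 0x00080008" computes (src >> 8) & 0xff.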
2281
2282void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2283 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2284 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2285 // Predicate: 0 < b <= c < 32
2286
2287 const SDValue &Shl = N->getOperand(0);
2288 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2289 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2290
2291 if (B && C) {
2292 uint32_t BVal = B->getZExtValue();
2293 uint32_t CVal = C->getZExtValue();
2294
2295 if (0 < BVal && BVal <= CVal && CVal < 32) {
2296 bool Signed = N->getOpcode() == ISD::SRA;
2297 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2298 32 - CVal));
2299 return;
2300 }
2301 }
2302 SelectCode(N);
2303}
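// Worked example of the rewrite above: for b = 8 and c = 24 (satisfying
// 0 < 8 <= 24 < 32), "(a << 8) srl 24" becomes
//   BFE_U32 a, (24 - 8), (32 - 24) == BFE_U32 a, 16, 8
// i.e. bits [23:16] of a, which matches the shift pair on a 32-bit value.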
2304
2305void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2306 switch (N->getOpcode()) {
2307 case ISD::AND:
2308 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2309 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2310 // Predicate: isMask(mask)
2311 const SDValue &Srl = N->getOperand(0);
2312 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2313 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2314
2315 if (Shift && Mask) {
2316 uint32_t ShiftVal = Shift->getZExtValue();
2317 uint32_t MaskVal = Mask->getZExtValue();
2318
2319 if (isMask_32(MaskVal)) {
2320 uint32_t WidthVal = llvm::popcount(MaskVal);
2321 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2322 WidthVal));
2323 return;
2324 }
2325 }
2326 }
2327 break;
2328 case ISD::SRL:
2329 if (N->getOperand(0).getOpcode() == ISD::AND) {
2330 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2331 // Predicate: isMask(mask >> b)
2332 const SDValue &And = N->getOperand(0);
2333 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2334 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2335
2336 if (Shift && Mask) {
2337 uint32_t ShiftVal = Shift->getZExtValue();
2338 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2339
2340 if (isMask_32(MaskVal)) {
2341 uint32_t WidthVal = llvm::popcount(MaskVal);
2342 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2343 WidthVal));
2344 return;
2345 }
2346 }
2347 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2348 SelectS_BFEFromShifts(N);
2349 return;
2350 }
2351 break;
2352 case ISD::SRA:
2353 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2354 SelectS_BFEFromShifts(N);
2355 return;
2356 }
2357 break;
2358
2359 case ISD::SIGN_EXTEND_INREG: {
2360 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2361 SDValue Src = N->getOperand(0);
2362 if (Src.getOpcode() != ISD::SRL)
2363 break;
2364
2365 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2366 if (!Amt)
2367 break;
2368
2369 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2370 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2371 Amt->getZExtValue(), Width));
2372 return;
2373 }
2374 }
2375
2376 SelectCode(N);
2377}
2378
2379bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2380 assert(N->getOpcode() == ISD::BRCOND);
2381 if (!N->hasOneUse())
2382 return false;
2383
2384 SDValue Cond = N->getOperand(1);
2385 if (Cond.getOpcode() == ISD::CopyToReg)
2386 Cond = Cond.getOperand(2);
2387
2388 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2389 return false;
2390
2391 MVT VT = Cond.getOperand(0).getSimpleValueType();
2392 if (VT == MVT::i32)
2393 return true;
2394
2395 if (VT == MVT::i64) {
2396 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2397
2398 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2399 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2400 }
2401
2402 return false;
2403}
2404
2405static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2406 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2407 // Special case for amdgcn.ballot:
2408 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2409 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2410 // =>
2411 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2412 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2413 // Cond becomes a i(WaveSize) full mask value.
2414 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2415 // it here for completeness; in that case Negate is set to true on return.
2416 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2417 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2418 isNullConstant(VCMP.getOperand(1))) {
2419
2420 auto Cond = VCMP.getOperand(0);
2421 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2422 Cond = Cond.getOperand(0);
2423
2424 if (isBoolSGPR(Cond)) {
2425 Negate = VCMP_CC == ISD::SETEQ;
2426 return Cond;
2427 }
2428 }
2429 return SDValue();
2430}
2431
2432void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2433 SDValue Cond = N->getOperand(1);
2434
2435 if (Cond.isUndef()) {
2436 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2437 N->getOperand(2), N->getOperand(0));
2438 return;
2439 }
2440
2441 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2442 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2443
2444 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2445 bool AndExec = !UseSCCBr;
2446 bool Negate = false;
2447
2448 if (Cond.getOpcode() == ISD::SETCC &&
2449 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2450 SDValue VCMP = Cond->getOperand(0);
2451 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2452 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2453 isNullConstant(Cond->getOperand(1)) &&
2454 // We may encounter ballot.i64 in wave32 mode on -O0.
2455 VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2456 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2457 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2458 // BRCOND i1 %C, %BB
2459 // =>
2460 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2461 // VCC = COPY i(WaveSize) %VCMP
2462 // S_CBRANCH_VCCNZ/VCCZ %BB
2463 Negate = CC == ISD::SETEQ;
2464 bool NegatedBallot = false;
2465 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2466 Cond = BallotCond;
2467 UseSCCBr = !BallotCond->isDivergent();
2468 Negate = Negate ^ NegatedBallot;
2469 } else {
2470 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2471 // selected as V_CMP, but this may change for uniform condition.
2472 Cond = VCMP;
2473 UseSCCBr = false;
2474 }
2475 }
2476 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2477 // V_CMPs resulting from a ballot, or the ballot had a uniform condition and
2478 // SCC is used.
2479 AndExec = false;
2480 }
2481
2482 unsigned BrOp =
2483 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2484 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2485 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2486 SDLoc SL(N);
2487
2488 if (AndExec) {
2489 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2490 // analyzed what generates the vcc value, so we do not know whether vcc
2491 // bits for disabled lanes are 0. Thus we need to mask out bits for
2492 // disabled lanes.
2493 //
2494 // For the case that we select S_CBRANCH_SCC1 and it gets
2495 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2496 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2497 //
2498 // We could add an analysis of what generates the vcc value here and omit
2499 // the S_AND when it is unnecessary. But it would be better to add a separate
2500 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2501 // catches both cases.
2502 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2503 : AMDGPU::S_AND_B64,
2504 SL, MVT::i1,
2505 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2506 : AMDGPU::EXEC,
2507 MVT::i1),
2508 Cond),
2509 0);
2510 }
2511
2512 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2513 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2514 N->getOperand(2), // Basic Block
2515 VCC.getValue(0));
2516}
2517
2518void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2519 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2520 !N->isDivergent()) {
2521 SDValue Src = N->getOperand(0);
2522 if (Src.getValueType() == MVT::f16) {
2523 if (isExtractHiElt(Src, Src)) {
2524 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2525 {Src});
2526 return;
2527 }
2528 }
2529 }
2530
2531 SelectCode(N);
2532}
2533
2534void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2535 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2536 // be copied to an SGPR with readfirstlane.
2537 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2538 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2539
2540 SDValue Chain = N->getOperand(0);
2541 SDValue Ptr = N->getOperand(2);
2542 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2543 MachineMemOperand *MMO = M->getMemOperand();
2544 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2545
2546 SDValue Offset;
2547 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2548 SDValue PtrBase = Ptr.getOperand(0);
2549 SDValue PtrOffset = Ptr.getOperand(1);
2550
2551 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2552 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2553 N = glueCopyToM0(N, PtrBase);
2554 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2555 }
2556 }
2557
2558 if (!Offset) {
2559 N = glueCopyToM0(N, Ptr);
2560 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2561 }
2562
2563 SDValue Ops[] = {
2564 Offset,
2565 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2566 Chain,
2567 N->getOperand(N->getNumOperands() - 1) // New glue
2568 };
2569
2570 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2571 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2572}
2573
2574// We need to handle this here because tablegen doesn't support matching
2575// instructions with multiple outputs.
2576void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2577 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2578 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2579 N->getOperand(5), N->getOperand(0)};
2580
2581 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2582 MachineMemOperand *MMO = M->getMemOperand();
2583 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2584 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2585}
2586
2587static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2588 switch (IntrID) {
2589 case Intrinsic::amdgcn_ds_gws_init:
2590 return AMDGPU::DS_GWS_INIT;
2591 case Intrinsic::amdgcn_ds_gws_barrier:
2592 return AMDGPU::DS_GWS_BARRIER;
2593 case Intrinsic::amdgcn_ds_gws_sema_v:
2594 return AMDGPU::DS_GWS_SEMA_V;
2595 case Intrinsic::amdgcn_ds_gws_sema_br:
2596 return AMDGPU::DS_GWS_SEMA_BR;
2597 case Intrinsic::amdgcn_ds_gws_sema_p:
2598 return AMDGPU::DS_GWS_SEMA_P;
2599 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2600 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2601 default:
2602 llvm_unreachable("not a gws intrinsic");
2603 }
2604}
2605
2606void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2607 if (!Subtarget->hasGWS() ||
2608 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2609 !Subtarget->hasGWSSemaReleaseAll())) {
2610 // Let this error.
2611 SelectCode(N);
2612 return;
2613 }
2614
2615 // Chain, intrinsic ID, vsrc, offset
2616 const bool HasVSrc = N->getNumOperands() == 4;
2617 assert(HasVSrc || N->getNumOperands() == 3);
2618
2619 SDLoc SL(N);
2620 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2621 int ImmOffset = 0;
2622 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2623 MachineMemOperand *MMO = M->getMemOperand();
2624
2625 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
2626 // effect, so SIFixSGPRCopies will validly insert a readfirstlane.
2627
2628 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2629 // offset field) % 64. Some versions of the programming guide omit the m0
2630 // part, or claim it's from offset 0.
2631 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2632 // If we have a constant offset, try to use the 0 in m0 as the base.
2633 // TODO: Look into changing the default m0 initialization value. If the
2634 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2635 // the immediate offset.
2636 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2637 ImmOffset = ConstOffset->getZExtValue();
2638 } else {
2639 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2640 ImmOffset = BaseOffset.getConstantOperandVal(1);
2641 BaseOffset = BaseOffset.getOperand(0);
2642 }
2643
2644 // Prefer to do the shift in an SGPR since it should be possible to use m0
2645 // as the result directly. If it's already an SGPR, it will be eliminated
2646 // later.
2647 SDNode *SGPROffset
2648 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2649 BaseOffset);
2650 // Shift to offset in m0
2651 SDNode *M0Base
2652 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2653 SDValue(SGPROffset, 0),
2654 CurDAG->getTargetConstant(16, SL, MVT::i32));
2655 glueCopyToM0(N, SDValue(M0Base, 0));
2656 }
2657
2658 SDValue Chain = N->getOperand(0);
2659 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2660
2661 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2662 SmallVector<SDValue, 5> Ops;
2663 if (HasVSrc)
2664 Ops.push_back(N->getOperand(2));
2665 Ops.push_back(OffsetField);
2666 Ops.push_back(Chain);
2667
2668 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2669 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2670}
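// Sketch of the non-constant path above: the resource id must end up in
// m0[21:16], so the selection is morally
//   s0 = V_READFIRSTLANE_B32 offset
//   m0 = S_LSHL_B32 s0, 16
// with any constant part left in the instruction's own offset field.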
2671
2672void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2673 if (Subtarget->getLDSBankCount() != 16) {
2674 // This is a single instruction with a pattern.
2675 SelectCode(N);
2676 return;
2677 }
2678
2679 SDLoc DL(N);
2680
2681 // This requires 2 instructions. It is possible to write a pattern to support
2682 // this, but the generated isel emitter doesn't correctly deal with multiple
2683 // output instructions using the same physical register input. The copy to m0
2684 // is incorrectly placed before the second instruction.
2685 //
2686 // TODO: Match source modifiers.
2687 //
2688 // def : Pat <
2689 // (int_amdgcn_interp_p1_f16
2690 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2691 // (i32 timm:$attrchan), (i32 timm:$attr),
2692 // (i1 timm:$high), M0),
2693 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2694 // timm:$attrchan, 0,
2695 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2696 // let Predicates = [has16BankLDS];
2697 // }
2698
2699 // 16 bank LDS
2700 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2701 N->getOperand(5), SDValue());
2702
2703 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2704
2705 SDNode *InterpMov =
2706 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2707 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2708 N->getOperand(3), // Attr
2709 N->getOperand(2), // Attrchan
2710 ToM0.getValue(1) // In glue
2711 });
2712
2713 SDNode *InterpP1LV =
2714 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2715 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2716 N->getOperand(1), // Src0
2717 N->getOperand(3), // Attr
2718 N->getOperand(2), // Attrchan
2719 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2720 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2721 N->getOperand(4), // high
2722 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2723 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2724 SDValue(InterpMov, 1)
2725 });
2726
2727 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2728}
2729
2730void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2731 unsigned IntrID = N->getConstantOperandVal(1);
2732 switch (IntrID) {
2733 case Intrinsic::amdgcn_ds_append:
2734 case Intrinsic::amdgcn_ds_consume: {
2735 if (N->getValueType(0) != MVT::i32)
2736 break;
2737 SelectDSAppendConsume(N, IntrID);
2738 return;
2739 }
2740 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2741 SelectDSBvhStackIntrinsic(N);
2742 return;
2743 }
2744
2745 SelectCode(N);
2746}
2747
2748void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2749 unsigned IntrID = N->getConstantOperandVal(0);
2750 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2751 SDNode *ConvGlueNode = N->getGluedNode();
2752 if (ConvGlueNode) {
2753 // FIXME: Possibly iterate over multiple glue nodes?
2754 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2755 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2756 ConvGlueNode =
2757 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2758 MVT::Glue, SDValue(ConvGlueNode, 0));
2759 } else {
2760 ConvGlueNode = nullptr;
2761 }
2762 switch (IntrID) {
2763 case Intrinsic::amdgcn_wqm:
2764 Opcode = AMDGPU::WQM;
2765 break;
2766 case Intrinsic::amdgcn_softwqm:
2767 Opcode = AMDGPU::SOFT_WQM;
2768 break;
2769 case Intrinsic::amdgcn_wwm:
2770 case Intrinsic::amdgcn_strict_wwm:
2771 Opcode = AMDGPU::STRICT_WWM;
2772 break;
2773 case Intrinsic::amdgcn_strict_wqm:
2774 Opcode = AMDGPU::STRICT_WQM;
2775 break;
2776 case Intrinsic::amdgcn_interp_p1_f16:
2777 SelectInterpP1F16(N);
2778 return;
2779 default:
2780 SelectCode(N);
2781 break;
2782 }
2783
2784 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2785 SDValue Src = N->getOperand(1);
2786 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2787 }
2788
2789 if (ConvGlueNode) {
2790 SmallVector<SDValue, 4> NewOps(N->ops());
2791 NewOps.push_back(SDValue(ConvGlueNode, 0));
2792 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2793 }
2794}
2795
2796void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2797 unsigned IntrID = N->getConstantOperandVal(1);
2798 switch (IntrID) {
2799 case Intrinsic::amdgcn_ds_gws_init:
2800 case Intrinsic::amdgcn_ds_gws_barrier:
2801 case Intrinsic::amdgcn_ds_gws_sema_v:
2802 case Intrinsic::amdgcn_ds_gws_sema_br:
2803 case Intrinsic::amdgcn_ds_gws_sema_p:
2804 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2805 SelectDS_GWS(N, IntrID);
2806 return;
2807 default:
2808 break;
2809 }
2810
2811 SelectCode(N);
2812}
2813
2814void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2815 SDValue Log2WaveSize =
2816 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2817 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2818 {N->getOperand(0), Log2WaveSize});
2819}
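// The input of WAVE_ADDRESS is the wave-scaled SP value; shifting it right
// by log2(wavefront size) recovers the per-lane byte address, e.g. on
// wave64: lane_addr = wave_addr >> 6 (sketch). SelectSTACKRESTORE below
// applies the inverse S_LSHL_B32 when copying a lane address back into SP.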
2820
2821void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2822 SDValue SrcVal = N->getOperand(1);
2823 if (SrcVal.getValueType() != MVT::i32) {
2824 SelectCode(N); // Emit default error
2825 return;
2826 }
2827
2828 SDValue CopyVal;
2829 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2830 SDLoc SL(N);
2831
2832 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2833 CopyVal = SrcVal.getOperand(0);
2834 } else {
2835 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2836 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2837
2838 if (N->isDivergent()) {
2839 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2840 MVT::i32, SrcVal),
2841 0);
2842 }
2843
2844 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2845 {SrcVal, Log2WaveSize}),
2846 0);
2847 }
2848
2849 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2850 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2851}
2852
2853bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2854 unsigned &Mods,
2855 bool IsCanonicalizing,
2856 bool AllowAbs) const {
2857 Mods = SISrcMods::NONE;
2858 Src = In;
2859
2860 if (Src.getOpcode() == ISD::FNEG) {
2861 Mods |= SISrcMods::NEG;
2862 Src = Src.getOperand(0);
2863 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2864 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2865 // denormal mode, but we're implicitly canonicalizing in a source operand.
2866 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2867 if (LHS && LHS->isZero()) {
2868 Mods |= SISrcMods::NEG;
2869 Src = Src.getOperand(1);
2870 }
2871 }
2872
2873 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2874 Mods |= SISrcMods::ABS;
2875 Src = Src.getOperand(0);
2876 }
2877
2878 return true;
2879}
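// E.g. for In = (fneg (fabs x)) the code above returns Src = x with
// Mods = SISrcMods::NEG | SISrcMods::ABS (printed as -|x|); when
// IsCanonicalizing, (fsub 0.0, x) folds to the same NEG bit as (fneg x).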
2880
2881bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2882 SDValue &SrcMods) const {
2883 unsigned Mods;
2884 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2885 /*AllowAbs=*/true)) {
2886 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2887 return true;
2888 }
2889
2890 return false;
2891}
2892
2893bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2894 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2895 unsigned Mods;
2896 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2897 /*AllowAbs=*/true)) {
2898 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2899 return true;
2900 }
2901
2902 return false;
2903}
2904
2905bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2906 SDValue &SrcMods) const {
2907 unsigned Mods;
2908 if (SelectVOP3ModsImpl(In, Src, Mods,
2909 /*IsCanonicalizing=*/true,
2910 /*AllowAbs=*/false)) {
2911 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2912 return true;
2913 }
2914
2915 return false;
2916}
2917
2918bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2919 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2920 return false;
2921
2922 Src = In;
2923 return true;
2924}
2925
2926bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2927 SDValue &SrcMods,
2928 bool OpSel) const {
2929 unsigned Mods;
2930 if (SelectVOP3ModsImpl(In, Src, Mods,
2931 /*IsCanonicalizing=*/true,
2932 /*AllowAbs=*/false)) {
2933 if (OpSel)
2934 Mods |= SISrcMods::OP_SEL_0;
2935 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2936 return true;
2937 }
2938
2939 return false;
2940}
2941
2942bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2943 SDValue &SrcMods) const {
2944 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2945}
2946
2947bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2948 SDValue &SrcMods) const {
2949 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2950}
2951
2952bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2953 SDValue &SrcMods, SDValue &Clamp,
2954 SDValue &Omod) const {
2955 SDLoc DL(In);
2956 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2957 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2958
2959 return SelectVOP3Mods(In, Src, SrcMods);
2960}
2961
2962bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2963 SDValue &SrcMods, SDValue &Clamp,
2964 SDValue &Omod) const {
2965 SDLoc DL(In);
2966 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2967 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2968
2969 return SelectVOP3BMods(In, Src, SrcMods);
2970}
2971
2972bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2973 SDValue &Clamp, SDValue &Omod) const {
2974 Src = In;
2975
2976 SDLoc DL(In);
2977 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2978 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2979
2980 return true;
2981}
2982
2983bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2984 SDValue &SrcMods, bool IsDOT) const {
2985 unsigned Mods = SISrcMods::NONE;
2986 Src = In;
2987
2988 // TODO: Handle G_FSUB 0 as fneg
2989 if (Src.getOpcode() == ISD::FNEG) {
2990 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2991 Src = Src.getOperand(0);
2992 }
2993
2994 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2995 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2996 unsigned VecMods = Mods;
2997
2998 SDValue Lo = stripBitcast(Src.getOperand(0));
2999 SDValue Hi = stripBitcast(Src.getOperand(1));
3000
3001 if (Lo.getOpcode() == ISD::FNEG) {
3002 Lo = stripBitcast(Lo.getOperand(0));
3003 Mods ^= SISrcMods::NEG;
3004 }
3005
3006 if (Hi.getOpcode() == ISD::FNEG) {
3007 Hi = stripBitcast(Hi.getOperand(0));
3008 Mods ^= SISrcMods::NEG_HI;
3009 }
3010
3011 if (isExtractHiElt(Lo, Lo))
3012 Mods |= SISrcMods::OP_SEL_0;
3013
3014 if (isExtractHiElt(Hi, Hi))
3015 Mods |= SISrcMods::OP_SEL_1;
3016
3017 unsigned VecSize = Src.getValueSizeInBits();
3018 Lo = stripExtractLoElt(Lo);
3019 Hi = stripExtractLoElt(Hi);
3020
3021 if (Lo.getValueSizeInBits() > VecSize) {
3022 Lo = CurDAG->getTargetExtractSubreg(
3023 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3024 MVT::getIntegerVT(VecSize), Lo);
3025 }
3026
3027 if (Hi.getValueSizeInBits() > VecSize) {
3028 Hi = CurDAG->getTargetExtractSubreg(
3029 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3030 MVT::getIntegerVT(VecSize), Hi);
3031 }
3032
3033 assert(Lo.getValueSizeInBits() <= VecSize &&
3034 Hi.getValueSizeInBits() <= VecSize);
3035
3036 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3037 // Really a scalar input. Just select from the low half of the register to
3038 // avoid packing.
3039
3040 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3041 Src = Lo;
3042 } else {
3043 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3044
3045 SDLoc SL(In);
3046 SDValue Undef = SDValue(
3047 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3048 Lo.getValueType()), 0);
3049 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3050 : AMDGPU::SReg_64RegClassID;
3051 const SDValue Ops[] = {
3052 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3053 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3054 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3055
3056 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3057 Src.getValueType(), Ops), 0);
3058 }
3059 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3060 return true;
3061 }
3062
3063 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3064 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3065 .bitcastToAPInt().getZExtValue();
3066 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3067 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3068 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3069 return true;
3070 }
3071 }
3072
3073 Mods = VecMods;
3074 }
3075
3076 // Packed instructions do not have abs modifiers.
3077 Mods |= SISrcMods::OP_SEL_1;
3078
3079 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3080 return true;
3081}
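// E.g. (sketch) Src = (build_vector (extract_elt v2f16:$v, 1),
//                                   (extract_elt v2f16:$v, 1))
// sets OP_SEL_0 and OP_SEL_1, then both halves strip down to $v itself, so
// the "really a scalar input" path above selects $v directly, with op_sel
// picking its high half in both lanes.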
3082
3083bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3084 SDValue &SrcMods) const {
3085 return SelectVOP3PMods(In, Src, SrcMods, true);
3086}
3087
3088bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3089 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3090 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3091 // 1 promotes packed values to signed, 0 treats them as unsigned.
3092 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3093
3094 unsigned Mods = SISrcMods::OP_SEL_1;
3095 unsigned SrcSign = C->getZExtValue();
3096 if (SrcSign == 1)
3097 Mods ^= SISrcMods::NEG;
3098
3099 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3100 return true;
3101}
3102
3103bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3104 SDValue &Src) const {
3105 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3106 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3107
3108 unsigned Mods = SISrcMods::OP_SEL_1;
3109 unsigned SrcVal = C->getZExtValue();
3110 if (SrcVal == 1)
3111 Mods |= SISrcMods::OP_SEL_0;
3112
3113 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3114 return true;
3115}
3116
3117static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3118 llvm::SelectionDAG *CurDAG,
3119 const SDLoc &DL) {
3120 unsigned DstRegClass;
3121 EVT DstTy;
3122 switch (Elts.size()) {
3123 case 8:
3124 DstRegClass = AMDGPU::VReg_256RegClassID;
3125 DstTy = MVT::v8i32;
3126 break;
3127 case 4:
3128 DstRegClass = AMDGPU::VReg_128RegClassID;
3129 DstTy = MVT::v4i32;
3130 break;
3131 case 2:
3132 DstRegClass = AMDGPU::VReg_64RegClassID;
3133 DstTy = MVT::v2i32;
3134 break;
3135 default:
3136 llvm_unreachable("unhandled Reg sequence size");
3137 }
3138
3139 SmallVector<SDValue, 8 + 1> Ops;
3140 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3141 for (unsigned i = 0; i < Elts.size(); ++i) {
3142 Ops.push_back(Elts[i]);
3143 Ops.push_back(CurDAG->getTargetConstant(
3144 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3145 }
3146 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3147}
3148
3149static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3150 llvm::SelectionDAG *CurDAG,
3151 const SDLoc &DL) {
3152 SmallVector<SDValue, 8> PackedElts;
3153 assert("unhandled Reg sequence size" &&
3154 (Elts.size() == 8 || Elts.size() == 16));
3155
3156 // Pack 16-bit elements in pairs into a 32-bit register. If both elements are
3157 // unpacked from the same 32-bit source, use it; otherwise pack with v_perm.
3158 for (unsigned i = 0; i < Elts.size(); i += 2) {
3159 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3160 SDValue HiSrc;
3161 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3162 PackedElts.push_back(HiSrc);
3163 } else {
3164 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3165 MachineSDNode *Packed =
3166 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3167 {Elts[i + 1], Elts[i], PackLoLo});
3168 PackedElts.push_back(SDValue(Packed, 0));
3169 }
3170 }
3171
3172 return buildRegSequence32(PackedElts, CurDAG, DL);
3173}
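// The 0x05040100 selector above maps, low result byte to high, to source
// bytes {0x00, 0x01, 0x04, 0x05}: per V_PERM_B32's byte numbering that is
// bytes 0-1 of src1 (Elts[i]) and bytes 0-1 of src0 (Elts[i + 1]), i.e.
//   result = (lo16(Elts[i + 1]) << 16) | lo16(Elts[i]).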
3174
3175static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3176 llvm::SelectionDAG *CurDAG,
3177 const SDLoc &DL, unsigned ElementSize) {
3178 if (ElementSize == 16)
3179 return buildRegSequence16(Elts, CurDAG, DL);
3180 if (ElementSize == 32)
3181 return buildRegSequence32(Elts, CurDAG, DL);
3182 llvm_unreachable("Unhandled element size");
3183}
3184
3185static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3186 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3187 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3188 unsigned ElementSize) {
3189 if (ModOpcode == ISD::FNEG) {
3190 Mods |= SISrcMods::NEG;
3191 // Check if all elements also have abs modifier
3192 SmallVector<SDValue, 8> NegAbsElts;
3193 for (auto El : Elts) {
3194 if (El.getOpcode() != ISD::FABS)
3195 break;
3196 NegAbsElts.push_back(El->getOperand(0));
3197 }
3198 if (Elts.size() != NegAbsElts.size()) {
3199 // Neg
3200 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3201 } else {
3202 // Neg and Abs
3203 Mods |= SISrcMods::NEG_HI;
3204 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3205 }
3206 } else {
3207 assert(ModOpcode == ISD::FABS);
3208 // Abs
3209 Mods |= SISrcMods::NEG_HI;
3210 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3211 }
3212}
3213
3214// Check all f16 elements for modifiers while looking through b32 and v2b16
3215 // build vector; stop if an element does not satisfy ModifierCheck.
3216static void
3217 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3218 std::function<bool(SDValue)> ModifierCheck) {
3219 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3220 if (auto *F16Pair =
3221 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3222 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3223 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3224 if (!ModifierCheck(ElF16))
3225 break;
3226 }
3227 }
3228 }
3229}
3230
3231bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3232 SDValue &SrcMods) const {
3233 Src = In;
3234 unsigned Mods = SISrcMods::OP_SEL_1;
3235
3236 // mods are on f16 elements
3237 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3238 SmallVector<SDValue, 8> EltsF16;
3239
3240 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3241 if (Element.getOpcode() != ISD::FNEG)
3242 return false;
3243 EltsF16.push_back(Element.getOperand(0));
3244 return true;
3245 });
3246
3247 // All elements have neg modifier
3248 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3249 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3250 Mods |= SISrcMods::NEG;
3251 Mods |= SISrcMods::NEG_HI;
3252 }
3253 }
3254
3255 // mods are on v2f16 elements
3256 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3257 SmallVector<SDValue, 8> EltsV2F16;
3258 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3259 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3260 // Match only the fneg modifier on each pair of elements.
3261 if (ElV2f16.getOpcode() != ISD::FNEG)
3262 break;
3263 EltsV2F16.push_back(ElV2f16.getOperand(0));
3264 }
3265
3266 // All pairs of elements have neg modifier
3267 if (BV->getNumOperands() == EltsV2F16.size()) {
3268 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3269 Mods |= SISrcMods::NEG;
3270 Mods |= SISrcMods::NEG_HI;
3271 }
3272 }
3273
3274 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3275 return true;
3276}
3277
3278bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3279 SDValue &SrcMods) const {
3280 Src = In;
3281 unsigned Mods = SISrcMods::OP_SEL_1;
3282 unsigned ModOpcode;
3283
3284 // mods are on f16 elements
3285 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3286 SmallVector<SDValue, 8> EltsF16;
3287 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3288 // Based on first element decide which mod we match, neg or abs
3289 if (EltsF16.empty())
3290 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3291 if (ElF16.getOpcode() != ModOpcode)
3292 return false;
3293 EltsF16.push_back(ElF16.getOperand(0));
3294 return true;
3295 });
3296
3297 // All elements have ModOpcode modifier
3298 if (BV->getNumOperands() * 2 == EltsF16.size())
3299 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3300 16);
3301 }
3302
3303 // mods are on v2f16 elements
3304 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3305 SmallVector<SDValue, 8> EltsV2F16;
3306
3307 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3308 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3309 // Based on first element decide which mod we match, neg or abs
3310 if (EltsV2F16.empty())
3311 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3312 if (ElV2f16->getOpcode() != ModOpcode)
3313 break;
3314 EltsV2F16.push_back(ElV2f16->getOperand(0));
3315 }
3316
3317 // All elements have ModOpcode modifier
3318 if (BV->getNumOperands() == EltsV2F16.size())
3319 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3320 32);
3321 }
3322
3323 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3324 return true;
3325}
3326
3327bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3328 SDValue &SrcMods) const {
3329 Src = In;
3330 unsigned Mods = SISrcMods::OP_SEL_1;
3331 SmallVector<SDValue, 8> EltsF32;
3332
3333 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3334 assert(BV->getNumOperands() > 0);
3335 // Based on first element decide which mod we match, neg or abs
3336 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3337 unsigned ModOpcode =
3338 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3339 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3340 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3341 if (ElF32.getOpcode() != ModOpcode)
3342 break;
3343 EltsF32.push_back(ElF32.getOperand(0));
3344 }
3345
3346 // All elements had ModOpcode modifier
3347 if (BV->getNumOperands() == EltsF32.size())
3348 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3349 32);
3350 }
3351
3352 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3353 return true;
3354}
3355
3356bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3357 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3358 BitVector UndefElements;
3359 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3360 if (isInlineImmediate(Splat.getNode())) {
3361 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3362 unsigned Imm = C->getAPIntValue().getSExtValue();
3363 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3364 return true;
3365 }
3366 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3367 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3368 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3369 return true;
3370 }
3371 llvm_unreachable("unhandled Constant node");
3372 }
3373 }
3374
3375 // 16 bit splat
3376 SDValue SplatSrc32 = stripBitcast(In);
3377 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3378 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3379 SDValue SplatSrc16 = stripBitcast(Splat32);
3380 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3381 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3382 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3383 std::optional<APInt> RawValue;
3384 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3385 RawValue = C->getValueAPF().bitcastToAPInt();
3386 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3387 RawValue = C->getAPIntValue();
3388
3389 if (RawValue.has_value()) {
3390 EVT VT = In.getValueType().getScalarType();
3391 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3392 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3393 ? APFloat::IEEEhalf()
3394 : APFloat::BFloat(),
3395 RawValue.value());
3396 if (TII->isInlineConstant(FloatVal)) {
3397 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3398 MVT::i16);
3399 return true;
3400 }
3401 } else if (VT.getSimpleVT() == MVT::i16) {
3402 if (TII->isInlineConstant(RawValue.value())) {
3403 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3404 MVT::i16);
3405 return true;
3406 }
3407 } else
3408 llvm_unreachable("unknown 16-bit type");
3409 }
3410 }
3411 }
3412
3413 return false;
3414}
3415
3416bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3417 SDValue &IndexKey) const {
3418 unsigned Key = 0;
3419 Src = In;
3420
3421 if (In.getOpcode() == ISD::SRL) {
3422 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3423 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3424 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3425 ShiftAmt->getZExtValue() % 8 == 0) {
3426 Key = ShiftAmt->getZExtValue() / 8;
3427 Src = ShiftSrc;
3428 }
3429 }
3430
3431 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3432 return true;
3433}
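// E.g. In = (srl i32:$v, 16) yields Src = $v with Key = 2, letting the
// SWMMAC instruction read its index from byte 2 of $v instead of requiring
// the shift to be materialized.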
3434
3435bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3436 SDValue &IndexKey) const {
3437 unsigned Key = 0;
3438 Src = In;
3439
3440 if (In.getOpcode() == ISD::SRL) {
3441 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3442 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3443 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3444 ShiftAmt->getZExtValue() == 16) {
3445 Key = 1;
3446 Src = ShiftSrc;
3447 }
3448 }
3449
3450 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3451 return true;
3452}
3453
3454bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3455 SDValue &SrcMods) const {
3456 Src = In;
3457 // FIXME: Handle op_sel
3458 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3459 return true;
3460}
3461
3462bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3463 SDValue &SrcMods) const {
3464 // FIXME: Handle op_sel
3465 return SelectVOP3Mods(In, Src, SrcMods);
3466}
3467
3468// The return value is not whether the match is possible (which it always is),
3469// but whether or not a conversion is really used.
3470bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3471 unsigned &Mods) const {
3472 Mods = 0;
3473 SelectVOP3ModsImpl(In, Src, Mods);
3474
3475 if (Src.getOpcode() == ISD::FP_EXTEND) {
3476 Src = Src.getOperand(0);
3477 assert(Src.getValueType() == MVT::f16);
3478 Src = stripBitcast(Src);
3479
3480 // Be careful about folding modifiers if we already have an abs. fneg is
3481 // applied last, so we don't want to apply an earlier fneg.
3482 if ((Mods & SISrcMods::ABS) == 0) {
3483 unsigned ModsTmp;
3484 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3485
3486 if ((ModsTmp & SISrcMods::NEG) != 0)
3487 Mods ^= SISrcMods::NEG;
3488
3489 if ((ModsTmp & SISrcMods::ABS) != 0)
3490 Mods |= SISrcMods::ABS;
3491 }
3492
3493 // op_sel/op_sel_hi decide the source type and source.
3494 // If the source's op_sel_hi is set, it indicates a conversion from fp16.
3495 // If the source's op_sel is set, it picks the high half of the source
3496 // register.
3497
3498 Mods |= SISrcMods::OP_SEL_1;
3499 if (isExtractHiElt(Src, Src)) {
3500 Mods |= SISrcMods::OP_SEL_0;
3501
3502 // TODO: Should we try to look for neg/abs here?
3503 }
3504
3505 return true;
3506 }
3507
3508 return false;
3509}
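// Worked example (editorial, assumed DAG): for
//   %x : f16
//   In = (f32 (fp_extend (fneg %x)))
// the impl returns true with Src = %x and Mods = NEG | OP_SEL_1 (convert
// from fp16, negated). A plain f32 operand returns false, keeping only the
// modifiers gathered by SelectVOP3ModsImpl.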
3510
3511bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3512 SDValue &SrcMods) const {
3513 unsigned Mods = 0;
3514 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3515 return false;
3516 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3517 return true;
3518}
3519
3520bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3521 SDValue &SrcMods) const {
3522 unsigned Mods = 0;
3523 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3524 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3525 return true;
3526}
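// Editorial note: the two wrappers differ only in failure handling.
// SelectVOP3PMadMixModsExt fails unless the fp_extend pattern matched,
// while SelectVOP3PMadMixMods always succeeds and simply reports whatever
// modifiers were gathered.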
3527
3528SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3529 if (In.isUndef())
3530 return CurDAG->getUNDEF(MVT::i32);
3531
3532 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3533 SDLoc SL(In);
3534 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3535 }
3536
3537 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3538 SDLoc SL(In);
3539 return CurDAG->getConstant(
3540 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3541 }
3542
3543 SDValue Src;
3544 if (isExtractHiElt(In, Src))
3545 return Src;
3546
3547 return SDValue();
3548}
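// Example (editorial): getHi16Elt(i16 0x1234) produces the i32 constant
// 0x12340000, i.e. the value placed in the high half of a dword. For
// non-constant inputs it succeeds only when isExtractHiElt shows the input
// is already the high half of a 32-bit register.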
3549
3550bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3551 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3552
3553 const SIRegisterInfo *SIRI =
3554 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3555 const SIInstrInfo * SII =
3556 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3557
3558 unsigned Limit = 0;
3559 bool AllUsesAcceptSReg = true;
3560 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3561 Limit < 10 && U != E; ++U, ++Limit) {
3562 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3563
3564 // If the register class is unknown, it could be an unknown
3565 // register class that needs to be an SGPR, e.g. an inline asm
3566 // constraint
3567 if (!RC || SIRI->isSGPRClass(RC))
3568 return false;
3569
3570 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3571 AllUsesAcceptSReg = false;
3572 SDNode * User = *U;
3573 if (User->isMachineOpcode()) {
3574 unsigned Opc = User->getMachineOpcode();
3575 const MCInstrDesc &Desc = SII->get(Opc);
3576 if (Desc.isCommutable()) {
3577 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3578 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3579 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3580 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3581 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3582 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3583 CommutedRC == &AMDGPU::VS_64RegClass)
3584 AllUsesAcceptSReg = true;
3585 }
3586 }
3587 }
3588 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
3589 // commuting current user. This means have at least one use
3590 // that strictly require VGPR. Thus, we will not attempt to commute
3591 // other user instructions.
3592 if (!AllUsesAcceptSReg)
3593 break;
3594 }
3595 }
3596 return !AllUsesAcceptSReg && (Limit < 10);
3597}
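// Editorial summary: the immediate stays in a VGPR only if, among the first
// 10 uses inspected, at least one operand cannot accept an SGPR even after
// commuting; nodes with more uses than the Limit allows are conservatively
// rejected.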
3598
3599bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3600 auto Ld = cast<LoadSDNode>(N);
3601
3602 const MachineMemOperand *MMO = Ld->getMemOperand();
3603 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3604 return false;
3605
3606 return MMO->getSize().hasValue() &&
3607 Ld->getAlign() >=
3608 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3609 uint64_t(4))) &&
3610 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3611 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3612 (Subtarget->getScalarizeGlobalBehavior() &&
3613 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3614 Ld->isSimple() &&
3615 static_cast<const SITargetLowering *>(getTargetLowering())
3616 ->isMemOpHasNoClobberedMemOperand(N)));
3617}
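// Editorial summary: a load counts as uniform when it is non-divergent (or
// its memory operand is provably uniform), aligned to at least
// min(access size, 4) bytes, and addresses constant memory -- or global
// memory when scalarized global loads are enabled and the location is known
// not to be clobbered.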
3618
3619void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3620 const AMDGPUTargetLowering& Lowering =
3621 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3622 bool IsModified = false;
3623 do {
3624 IsModified = false;
3625
3626 // Go over all selected nodes and try to fold them a bit more
3627 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3628 while (Position != CurDAG->allnodes_end()) {
3629 SDNode *Node = &*Position++;
3630 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3631 if (!MachineNode)
3632 continue;
3633
3634 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3635 if (ResNode != Node) {
3636 if (ResNode)
3637 ReplaceUses(Node, ResNode);
3638 IsModified = true;
3639 }
3640 }
3641 CurDAG->RemoveDeadNodes();
3642 } while (IsModified);
3643}
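// Editorial sketch of the loop above: PostISelFolding is retried over every
// machine node until a full pass makes no change. A ResNode different from
// Node marks progress; a null ResNode means the node was deleted outright,
// and RemoveDeadNodes prunes anything left unreachable each iteration.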
3644
3645AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3646 CodeGenOptLevel OptLevel)
3647 : SelectionDAGISelLegacy(
3648 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3649