AMDGPUISelDAGToDAG.cpp
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
31
32#ifdef EXPENSIVE_CHECKS
33#include "llvm/Analysis/LoopInfo.h"
34#include "llvm/IR/Dominators.h"
35#endif
36
37#define DEBUG_TYPE "amdgpu-isel"
38
39using namespace llvm;
40
41//===----------------------------------------------------------------------===//
42// Instruction Selector Implementation
43//===----------------------------------------------------------------------===//
44
45namespace {
46static SDValue stripBitcast(SDValue Val) {
47 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
48}
49
50// Figure out if this is really an extract of the high 16-bits of a dword.
51static bool isExtractHiElt(SDValue In, SDValue &Out) {
52 In = stripBitcast(In);
53
54 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
55 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
56 if (!Idx->isOne())
57 return false;
58 Out = In.getOperand(0);
59 return true;
60 }
61 }
62
63 if (In.getOpcode() != ISD::TRUNCATE)
64 return false;
65
66 SDValue Srl = In.getOperand(0);
67 if (Srl.getOpcode() == ISD::SRL) {
68 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
69 if (ShiftAmt->getZExtValue() == 16) {
70 Out = stripBitcast(Srl.getOperand(0));
71 return true;
72 }
73 }
74 }
75
76 return false;
77}
78
79// Look through operations that obscure just looking at the low 16-bits of the
80// same register.
81static SDValue stripExtractLoElt(SDValue In) {
82 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
83 SDValue Idx = In.getOperand(1);
84 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
85 return In.getOperand(0);
86 }
87
88 if (In.getOpcode() == ISD::TRUNCATE) {
89 SDValue Src = In.getOperand(0);
90 if (Src.getValueType().getSizeInBits() == 32)
91 return stripBitcast(Src);
92 }
93
94 return In;
95}
96
97} // end anonymous namespace
98
100 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
101 false)
103INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
105#ifdef EXPENSIVE_CHECKS
108#endif
110 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
111 false)
112
113/// This pass converts a legalized DAG into an AMDGPU-specific
114/// DAG, ready for instruction scheduling.
115SelectionDAGISel *llvm::createAMDGPUISelDag(TargetMachine &TM,
116 CodeGenOptLevel OptLevel) {
117 return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
118}
119
120AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
121 CodeGenOptLevel OptLevel)
122 : SelectionDAGISel(TM, OptLevel) {}
123
124bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
125 Subtarget = &MF.getSubtarget<GCNSubtarget>();
127 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
128 return SelectionDAGISel::runOnMachineFunction(MF);
129}
130
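// Returns true when the instruction selected for a 16-bit FP operation with
// opcode Opc is known to write zeros into the high 16 bits of the 32-bit
// register that holds the result.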
131bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
132 // XXX - only need to list legal operations.
133 switch (Opc) {
134 case ISD::FADD:
135 case ISD::FSUB:
136 case ISD::FMUL:
137 case ISD::FDIV:
138 case ISD::FREM:
140 case ISD::UINT_TO_FP:
141 case ISD::SINT_TO_FP:
142 case ISD::FABS:
143 // Fabs is lowered to a bit operation, but it's an and which will clear the
144 // high bits anyway.
145 case ISD::FSQRT:
146 case ISD::FSIN:
147 case ISD::FCOS:
148 case ISD::FPOWI:
149 case ISD::FPOW:
150 case ISD::FLOG:
151 case ISD::FLOG2:
152 case ISD::FLOG10:
153 case ISD::FEXP:
154 case ISD::FEXP2:
155 case ISD::FCEIL:
156 case ISD::FTRUNC:
157 case ISD::FRINT:
158 case ISD::FNEARBYINT:
159 case ISD::FROUNDEVEN:
160 case ISD::FROUND:
161 case ISD::FFLOOR:
162 case ISD::FMINNUM:
163 case ISD::FMAXNUM:
164 case ISD::FLDEXP:
165 case AMDGPUISD::FRACT:
166 case AMDGPUISD::CLAMP:
169 case AMDGPUISD::FMIN3:
170 case AMDGPUISD::FMAX3:
171 case AMDGPUISD::FMED3:
173 case AMDGPUISD::RCP:
174 case AMDGPUISD::RSQ:
176 // On gfx10, all 16-bit instructions preserve the high bits.
177 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
178 case ISD::FP_ROUND:
179 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
180 // high bits on gfx9.
181 // TODO: If we had the source node we could see if the source was fma/mad
183 case ISD::FMA:
184 case ISD::FMAD:
187 default:
188 // fcopysign, select and others may be lowered to 32-bit bit operations
189 // which don't zero the high bits.
190 return false;
191 }
192}
193
194bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
195#ifdef EXPENSIVE_CHECKS
196 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
197 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
198 for (auto &L : LI->getLoopsInPreorder()) {
199 assert(L->isLCSSAForm(DT));
200 }
201#endif
202 return SelectionDAGISelLegacy::runOnMachineFunction(MF);
203}
204
205void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
206 AU.addRequired<AMDGPUArgumentUsageInfo>();
207 AU.addRequired<UniformityInfoWrapperPass>();
208#ifdef EXPENSIVE_CHECKS
209 AU.addRequired<DominatorTreeWrapperPass>();
210 AU.addRequired<LoopInfoWrapperPass>();
211#endif
212 SelectionDAGISelLegacy::getAnalysisUsage(AU);
213}
214
215bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
216 assert(Subtarget->d16PreservesUnusedBits());
217 MVT VT = N->getValueType(0).getSimpleVT();
218 if (VT != MVT::v2i16 && VT != MVT::v2f16)
219 return false;
220
221 SDValue Lo = N->getOperand(0);
222 SDValue Hi = N->getOperand(1);
223
224 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
225
226 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
227 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
228 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
229
230 // Need to check for possible indirect dependencies on the other half of the
231 // vector to avoid introducing a cycle.
232 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
233 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
234 SDValue TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, Lo);
235
236 SDValue Ops[] = {
237 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
238 };
239
240 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
241 if (LdHi->getMemoryVT() == MVT::i8) {
242 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
243 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
244 } else {
245 assert(LdHi->getMemoryVT() == MVT::i16);
246 }
247
248 SDValue NewLoadHi =
249 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
250 Ops, LdHi->getMemoryVT(),
251 LdHi->getMemOperand());
252
253 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
254 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
255 return true;
256 }
257
258 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
259 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
260 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
261 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
262 if (LdLo && Lo.hasOneUse()) {
263 SDValue TiedIn = getHi16Elt(Hi);
264 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
265 return false;
266
267 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
268 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
269 if (LdLo->getMemoryVT() == MVT::i8) {
270 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
271 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
272 } else {
273 assert(LdLo->getMemoryVT() == MVT::i16);
274 }
275
276 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
277
278 SDValue Ops[] = {
279 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
280 };
281
282 SDValue NewLoadLo =
283 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
284 Ops, LdLo->getMemoryVT(),
285 LdLo->getMemOperand());
286
287 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
288 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
289 return true;
290 }
291
292 return false;
293}
294
295void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
296 if (!Subtarget->d16PreservesUnusedBits())
297 return;
298
299 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
300
301 bool MadeChange = false;
302 while (Position != CurDAG->allnodes_begin()) {
303 SDNode *N = &*--Position;
304 if (N->use_empty())
305 continue;
306
307 switch (N->getOpcode()) {
308 case ISD::BUILD_VECTOR:
309 // TODO: Match load d16 from shl (extload:i16), 16
310 MadeChange |= matchLoadD16FromBuildVector(N);
311 break;
312 default:
313 break;
314 }
315 }
316
317 if (MadeChange) {
318 CurDAG->RemoveDeadNodes();
319 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
320 CurDAG->dump(););
321 }
322}
323
324bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
325 if (N->isUndef())
326 return true;
327
328 const SIInstrInfo *TII = Subtarget->getInstrInfo();
329 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
330 return TII->isInlineConstant(C->getAPIntValue());
331
332 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
333 return TII->isInlineConstant(C->getValueAPF());
334
335 return false;
336}
337
338/// Determine the register class for \p OpNo
339/// \returns The register class of the virtual register that will be used for
340/// the given operand number \p OpNo or NULL if the register class cannot be
341/// determined.
342const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
343 unsigned OpNo) const {
344 if (!N->isMachineOpcode()) {
345 if (N->getOpcode() == ISD::CopyToReg) {
346 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
347 if (Reg.isVirtual()) {
348 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
349 return MRI.getRegClass(Reg);
350 }
351
352 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
353 return TRI->getPhysRegBaseClass(Reg);
354 }
355
356 return nullptr;
357 }
358
359 switch (N->getMachineOpcode()) {
360 default: {
361 const MCInstrDesc &Desc =
362 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
363 unsigned OpIdx = Desc.getNumDefs() + OpNo;
364 if (OpIdx >= Desc.getNumOperands())
365 return nullptr;
366 int RegClass = Desc.operands()[OpIdx].RegClass;
367 if (RegClass == -1)
368 return nullptr;
369
370 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
371 }
372 case AMDGPU::REG_SEQUENCE: {
373 unsigned RCID = N->getConstantOperandVal(0);
374 const TargetRegisterClass *SuperRC =
375 Subtarget->getRegisterInfo()->getRegClass(RCID);
376
377 SDValue SubRegOp = N->getOperand(OpNo + 1);
378 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
379 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
380 SubRegIdx);
381 }
382 }
383}
384
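// Rebuild N in place with its chain operand replaced by NewChain and a glue
// operand appended, so N stays tied to the copy that produced the glue.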
385SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
386 SDValue Glue) const {
387 SmallVector <SDValue, 8> Ops;
388 Ops.push_back(NewChain); // Replace the chain.
389 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
390 Ops.push_back(N->getOperand(i));
391
392 Ops.push_back(Glue);
393 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
394}
395
396SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
397 const SITargetLowering& Lowering =
398 *static_cast<const SITargetLowering*>(getTargetLowering());
399
400 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
401
402 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
403 return glueCopyToOp(N, M0, M0.getValue(1));
404}
405
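// For DS/GDS memory nodes, insert the required M0 initialization: -1 (all of
// LDS) when the target still needs M0 initialized for LDS accesses, or the
// GDS size for the region address space. Other nodes are returned unchanged.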
406SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
407 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
408 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
409 if (Subtarget->ldsRequiresM0Init())
410 return glueCopyToM0(
411 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
412 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
413 MachineFunction &MF = CurDAG->getMachineFunction();
414 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
415 return
416 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
417 }
418 return N;
419}
420
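// Materialize a 64-bit immediate as two S_MOV_B32s combined into a 64-bit
// SGPR pair with a REG_SEQUENCE.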
421MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
422 EVT VT) const {
423 SDNode *Lo = CurDAG->getMachineNode(
424 AMDGPU::S_MOV_B32, DL, MVT::i32,
425 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
426 SDNode *Hi = CurDAG->getMachineNode(
427 AMDGPU::S_MOV_B32, DL, MVT::i32,
428 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
429 const SDValue Ops[] = {
430 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
431 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
432 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
433
434 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
435}
436
437void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
438 EVT VT = N->getValueType(0);
439 unsigned NumVectorElts = VT.getVectorNumElements();
440 EVT EltVT = VT.getVectorElementType();
441 SDLoc DL(N);
442 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
443
444 if (NumVectorElts == 1) {
445 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
446 RegClass);
447 return;
448 }
449
450 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
451 "supported yet");
452 // 32 = Max Num Vector Elements
453 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
454 // 1 = Vector Register Class
455 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
456
457 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
458 Triple::amdgcn;
459 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
460 bool IsRegSeq = true;
461 unsigned NOps = N->getNumOperands();
462 for (unsigned i = 0; i < NOps; i++) {
463 // XXX: Why is this here?
464 if (isa<RegisterSDNode>(N->getOperand(i))) {
465 IsRegSeq = false;
466 break;
467 }
468 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
469 : R600RegisterInfo::getSubRegFromChannel(i);
470 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
471 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
472 }
473 if (NOps != NumVectorElts) {
474 // Fill in the missing undef elements if this was a scalar_to_vector.
475 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
476 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
477 DL, EltVT);
478 for (unsigned i = NOps; i < NumVectorElts; ++i) {
479 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
480 : R600RegisterInfo::getSubRegFromChannel(i);
481 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
482 RegSeqArgs[1 + (2 * i) + 1] =
483 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
484 }
485 }
486
487 if (!IsRegSeq)
488 SelectCode(N);
489 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
490}
491
492void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
493 EVT VT = N->getValueType(0);
494 EVT EltVT = VT.getVectorElementType();
495
496 // TODO: Handle 16-bit element vectors with even aligned masks.
497 if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
498 VT.getVectorNumElements() != 2) {
499 SelectCode(N);
500 return;
501 }
502
503 auto *SVN = cast<ShuffleVectorSDNode>(N);
504
505 SDValue Src0 = SVN->getOperand(0);
506 SDValue Src1 = SVN->getOperand(1);
507 ArrayRef<int> Mask = SVN->getMask();
508 SDLoc DL(N);
509
510 assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
511 Mask[0] < 4 && Mask[1] < 4);
512
513 SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
514 SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
515 unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
516 unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
517
518 if (Mask[0] < 0) {
519 Src0SubReg = Src1SubReg;
520 MachineSDNode *ImpDef =
521 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
522 VSrc0 = SDValue(ImpDef, 0);
523 }
524
525 if (Mask[1] < 0) {
526 Src1SubReg = Src0SubReg;
527 MachineSDNode *ImpDef =
528 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
529 VSrc1 = SDValue(ImpDef, 0);
530 }
531
532 // SGPR case needs to lower to copies.
533 //
534 // Also use subregister extract when we can directly blend the registers with
535 // a simple subregister copy.
536 //
537 // TODO: Maybe we should fold this out earlier
538 if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
539 Src1SubReg == AMDGPU::sub0) {
540 // The low element of the result always comes from src0.
541 // The high element of the result always comes from src1.
542 // op_sel selects the high half of src0.
543 // op_sel_hi selects the high half of src1.
544
545 unsigned Src0OpSel =
546 Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
547 unsigned Src1OpSel =
548 Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
549
550 // Enable op_sel_hi to avoid printing it. This should have no effect on the
551 // result.
552 Src0OpSel |= SISrcMods::OP_SEL_1;
553 Src1OpSel |= SISrcMods::OP_SEL_1;
554
555 SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
556 SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
557 SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);
558
559 CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
560 {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
561 ZeroMods, // clamp
562 ZeroMods, // op_sel
563 ZeroMods, // op_sel_hi
564 ZeroMods, // neg_lo
565 ZeroMods}); // neg_hi
566 return;
567 }
568
569 SDValue ResultElt0 =
570 CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
571 SDValue ResultElt1 =
572 CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);
573
574 const SDValue Ops[] = {
575 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
576 ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
577 ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
578 CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
579}
580
581void AMDGPUDAGToDAGISel::Select(SDNode *N) {
582 unsigned int Opc = N->getOpcode();
583 if (N->isMachineOpcode()) {
584 N->setNodeId(-1);
585 return; // Already selected.
586 }
587
588 // isa<MemSDNode> almost works but is slightly too permissive for some DS
589 // intrinsics.
590 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
591 N = glueCopyToM0LDSInit(N);
592 SelectCode(N);
593 return;
594 }
595
596 switch (Opc) {
597 default:
598 break;
599 // We are selecting i64 ADD here instead of custom lower it during
600 // DAG legalization, so we can fold some i64 ADDs used for address
601 // calculation into the LOAD and STORE instructions.
602 case ISD::ADDC:
603 case ISD::ADDE:
604 case ISD::SUBC:
605 case ISD::SUBE: {
606 if (N->getValueType(0) != MVT::i64)
607 break;
608
609 SelectADD_SUB_I64(N);
610 return;
611 }
612 case ISD::UADDO_CARRY:
613 case ISD::USUBO_CARRY:
614 if (N->getValueType(0) != MVT::i32)
615 break;
616
617 SelectAddcSubb(N);
618 return;
619 case ISD::UADDO:
620 case ISD::USUBO: {
621 SelectUADDO_USUBO(N);
622 return;
623 }
624 case AMDGPUISD::FMUL_W_CHAIN: {
625 SelectFMUL_W_CHAIN(N);
626 return;
627 }
628 case AMDGPUISD::FMA_W_CHAIN: {
629 SelectFMA_W_CHAIN(N);
630 return;
631 }
632
633 case ISD::SCALAR_TO_VECTOR:
634 case ISD::BUILD_VECTOR: {
635 EVT VT = N->getValueType(0);
636 unsigned NumVectorElts = VT.getVectorNumElements();
637 if (VT.getScalarSizeInBits() == 16) {
638 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
639 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
640 ReplaceNode(N, Packed);
641 return;
642 }
643 }
644
645 break;
646 }
647
648 assert(VT.getVectorElementType().bitsEq(MVT::i32));
649 unsigned RegClassID =
650 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
651 SelectBuildVector(N, RegClassID);
652 return;
653 }
654 case ISD::VECTOR_SHUFFLE:
655 SelectVectorShuffle(N);
656 return;
657 case ISD::BUILD_PAIR: {
658 SDValue RC, SubReg0, SubReg1;
659 SDLoc DL(N);
660 if (N->getValueType(0) == MVT::i128) {
661 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
662 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
663 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
664 } else if (N->getValueType(0) == MVT::i64) {
665 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
666 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
667 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
668 } else {
669 llvm_unreachable("Unhandled value type for BUILD_PAIR");
670 }
671 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
672 N->getOperand(1), SubReg1 };
673 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
674 N->getValueType(0), Ops));
675 return;
676 }
677
678 case ISD::Constant:
679 case ISD::ConstantFP: {
680 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
681 break;
682
683 uint64_t Imm;
684 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
685 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
686 if (AMDGPU::isValid32BitLiteral(Imm, true))
687 break;
688 } else {
689 ConstantSDNode *C = cast<ConstantSDNode>(N);
690 Imm = C->getZExtValue();
691 if (AMDGPU::isValid32BitLiteral(Imm, false))
692 break;
693 }
694
695 SDLoc DL(N);
696 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
697 return;
698 }
699 case AMDGPUISD::BFE_I32:
700 case AMDGPUISD::BFE_U32: {
701 // There is a scalar version available, but unlike the vector version which
702 // has a separate operand for the offset and width, the scalar version packs
703 // the width and offset into a single operand. Try to move to the scalar
704 // version if the offsets are constant, so that we can try to keep extended
705 // loads of kernel arguments in SGPRs.
706
707 // TODO: Technically we could try to pattern match scalar bitshifts of
708 // dynamic values, but it's probably not useful.
709 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
710 if (!Offset)
711 break;
712
713 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
714 if (!Width)
715 break;
716
717 bool Signed = Opc == AMDGPUISD::BFE_I32;
718
719 uint32_t OffsetVal = Offset->getZExtValue();
720 uint32_t WidthVal = Width->getZExtValue();
721
722 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
723 WidthVal));
724 return;
725 }
726 case AMDGPUISD::DIV_SCALE: {
727 SelectDIV_SCALE(N);
728 return;
729 }
730 case AMDGPUISD::MAD_I64_I32:
731 case AMDGPUISD::MAD_U64_U32: {
732 SelectMAD_64_32(N);
733 return;
734 }
735 case ISD::SMUL_LOHI:
736 case ISD::UMUL_LOHI:
737 return SelectMUL_LOHI(N);
738 case ISD::CopyToReg: {
739 const SITargetLowering& Lowering =
740 *static_cast<const SITargetLowering*>(getTargetLowering());
741 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
742 break;
743 }
744 case ISD::AND:
745 case ISD::SRL:
746 case ISD::SRA:
747 case ISD::SIGN_EXTEND_INREG:
748 if (N->getValueType(0) != MVT::i32)
749 break;
750
751 SelectS_BFE(N);
752 return;
753 case ISD::BRCOND:
754 SelectBRCOND(N);
755 return;
756 case ISD::FP_EXTEND:
757 SelectFP_EXTEND(N);
758 return;
759 case AMDGPUISD::CVT_PKRTZ_F16_F32:
760 case AMDGPUISD::CVT_PKNORM_I16_F32:
761 case AMDGPUISD::CVT_PKNORM_U16_F32:
762 case AMDGPUISD::CVT_PK_U16_U32:
763 case AMDGPUISD::CVT_PK_I16_I32: {
764 // Hack around using a legal type if f16 is illegal.
765 if (N->getValueType(0) == MVT::i32) {
766 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
767 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
768 { N->getOperand(0), N->getOperand(1) });
769 SelectCode(N);
770 return;
771 }
772
773 break;
774 }
775 case ISD::INTRINSIC_W_CHAIN: {
776 SelectINTRINSIC_W_CHAIN(N);
777 return;
778 }
779 case ISD::INTRINSIC_WO_CHAIN: {
780 SelectINTRINSIC_WO_CHAIN(N);
781 return;
782 }
783 case ISD::INTRINSIC_VOID: {
784 SelectINTRINSIC_VOID(N);
785 return;
786 }
787 case AMDGPUISD::WAVE_ADDRESS: {
788 SelectWAVE_ADDRESS(N);
789 return;
790 }
791 case ISD::STACKRESTORE: {
792 SelectSTACKRESTORE(N);
793 return;
794 }
795 }
796
797 SelectCode(N);
798}
799
800bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
801 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
802 const Instruction *Term = BB->getTerminator();
803 return Term->getMetadata("amdgpu.uniform") ||
804 Term->getMetadata("structurizecfg.uniform");
805}
806
807bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
808 unsigned ShAmtBits) const {
809 assert(N->getOpcode() == ISD::AND);
810
811 const APInt &RHS = N->getConstantOperandAPInt(1);
812 if (RHS.countr_one() >= ShAmtBits)
813 return true;
814
815 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
816 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
817}
818
819static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
820 SDValue &N0, SDValue &N1) {
821 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
822 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
823 // As we split the 64-bit `or` earlier, it's a complicated pattern to match, i.e.
824 // (i64 (bitcast (v2i32 (build_vector
825 // (or (extract_vector_elt V, 0), OFFSET),
826 // (extract_vector_elt V, 1)))))
827 SDValue Lo = Addr.getOperand(0).getOperand(0);
828 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
829 SDValue BaseLo = Lo.getOperand(0);
830 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
831 // Check that split base (Lo and Hi) are extracted from the same one.
832 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
833 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
834 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
835 // Lo is statically extracted from index 0.
836 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
837 BaseLo.getConstantOperandVal(1) == 0 &&
838 // Hi is statically extracted from index 1.
839 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
840 BaseHi.getConstantOperandVal(1) == 1) {
841 N0 = BaseLo.getOperand(0).getOperand(0);
842 N1 = Lo.getOperand(1);
843 return true;
844 }
845 }
846 }
847 return false;
848}
849
850bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
851 SDValue &RHS) const {
852 if (CurDAG->isBaseWithConstantOffset(Addr)) {
853 LHS = Addr.getOperand(0);
854 RHS = Addr.getOperand(1);
855 return true;
856 }
857
858 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
859 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
860 return true;
861 }
862
863 return false;
864}
865
867 return "AMDGPU DAG->DAG Pattern Instruction Selection";
868}
869
870AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(TargetMachine &TM)
871 : SelectionDAGISelPass(
872 std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}
873
874PreservedAnalyses
875AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
876 MachineFunctionAnalysisManager &MFAM) {
877#ifdef EXPENSIVE_CHECKS
878 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
879 .getManager();
880 auto &F = MF.getFunction();
881 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
882 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
883 for (auto &L : LI.getLoopsInPreorder())
884 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
885#endif
886 return SelectionDAGISelPass::run(MF, MFAM);
887}
888
889//===----------------------------------------------------------------------===//
890// Complex Patterns
891//===----------------------------------------------------------------------===//
892
893bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
894 SDValue &Offset) {
895 return false;
896}
897
898bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
899 SDValue &Offset) {
900 ConstantSDNode *C;
901 SDLoc DL(Addr);
902
903 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
904 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
905 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
906 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
907 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
908 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
909 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
910 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
911 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
912 Base = Addr.getOperand(0);
913 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
914 } else {
915 Base = Addr;
916 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
917 }
918
919 return true;
920}
921
922SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
923 const SDLoc &DL) const {
924 SDNode *Mov = CurDAG->getMachineNode(
925 AMDGPU::S_MOV_B32, DL, MVT::i32,
926 CurDAG->getTargetConstant(Val, DL, MVT::i32));
927 return SDValue(Mov, 0);
928}
929
930// FIXME: Should only handle uaddo_carry/usubo_carry
931void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
932 SDLoc DL(N);
933 SDValue LHS = N->getOperand(0);
934 SDValue RHS = N->getOperand(1);
935
936 unsigned Opcode = N->getOpcode();
937 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
938 bool ProduceCarry =
939 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
940 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
941
942 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
943 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
944
945 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
946 DL, MVT::i32, LHS, Sub0);
947 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
948 DL, MVT::i32, LHS, Sub1);
949
950 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
951 DL, MVT::i32, RHS, Sub0);
952 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
953 DL, MVT::i32, RHS, Sub1);
954
955 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
956
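  // OpcMap is indexed as [consumes carry-in][divergent (VALU)][is add]; the
  // carry-consuming row is used for the low half only when a carry-in is
  // consumed, and always for the high half.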
957 static const unsigned OpcMap[2][2][2] = {
958 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
959 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
960 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
961 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
962
963 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
964 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
965
966 SDNode *AddLo;
967 if (!ConsumeCarry) {
968 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
969 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
970 } else {
971 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
972 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
973 }
974 SDValue AddHiArgs[] = {
975 SDValue(Hi0, 0),
976 SDValue(Hi1, 0),
977 SDValue(AddLo, 1)
978 };
979 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
980
981 SDValue RegSequenceArgs[] = {
982 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
983 SDValue(AddLo,0),
984 Sub0,
985 SDValue(AddHi,0),
986 Sub1,
987 };
988 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
989 MVT::i64, RegSequenceArgs);
990
991 if (ProduceCarry) {
992 // Replace the carry-use
993 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
994 }
995
996 // Replace the remaining uses.
997 ReplaceNode(N, RegSequence);
998}
999
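// Select a 32-bit add/sub with carry-in (UADDO_CARRY / USUBO_CARRY):
// divergent nodes use the VALU carry instructions, uniform ones the scalar
// carry pseudos.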
1000void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1001 SDLoc DL(N);
1002 SDValue LHS = N->getOperand(0);
1003 SDValue RHS = N->getOperand(1);
1004 SDValue CI = N->getOperand(2);
1005
1006 if (N->isDivergent()) {
1007 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1008 : AMDGPU::V_SUBB_U32_e64;
1009 CurDAG->SelectNodeTo(
1010 N, Opc, N->getVTList(),
1011 {LHS, RHS, CI,
1012 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1013 } else {
1014 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1015 : AMDGPU::S_SUB_CO_PSEUDO;
1016 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1017 }
1018}
1019
1020void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1021 // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1022 // carry out despite the _i32 name. These were renamed in VI to _U32.
1023 // FIXME: We should probably rename the opcodes here.
1024 bool IsAdd = N->getOpcode() == ISD::UADDO;
1025 bool IsVALU = N->isDivergent();
1026
1027 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1028 ++UI)
1029 if (UI.getUse().getResNo() == 1) {
1030 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
1031 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
1032 IsVALU = true;
1033 break;
1034 }
1035 }
1036
1037 if (IsVALU) {
1038 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1039
1040 CurDAG->SelectNodeTo(
1041 N, Opc, N->getVTList(),
1042 {N->getOperand(0), N->getOperand(1),
1043 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1044 } else {
1045 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1046 : AMDGPU::S_USUBO_PSEUDO;
1047
1048 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1049 {N->getOperand(0), N->getOperand(1)});
1050 }
1051}
1052
1053void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1054 SDLoc SL(N);
1055 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1056 SDValue Ops[10];
1057
1058 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1059 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1060 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1061 Ops[8] = N->getOperand(0);
1062 Ops[9] = N->getOperand(4);
1063
1064 // If there are no source modifiers, prefer fmac over fma because it can use
1065 // the smaller VOP2 encoding.
1066 bool UseFMAC = Subtarget->hasDLInsts() &&
1067 cast<ConstantSDNode>(Ops[0])->isZero() &&
1068 cast<ConstantSDNode>(Ops[2])->isZero() &&
1069 cast<ConstantSDNode>(Ops[4])->isZero();
1070 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
1071 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
1072}
1073
1074void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1075 SDLoc SL(N);
1076 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
1077 SDValue Ops[8];
1078
1079 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1080 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1081 Ops[6] = N->getOperand(0);
1082 Ops[7] = N->getOperand(3);
1083
1084 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1085}
1086
1087// We need to handle this here because tablegen doesn't support matching
1088// instructions with multiple outputs.
1089void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1090 SDLoc SL(N);
1091 EVT VT = N->getValueType(0);
1092
1093 assert(VT == MVT::f32 || VT == MVT::f64);
1094
1095 unsigned Opc
1096 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1097
1098 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1099 // omod
1100 SDValue Ops[8];
1101 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1102 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1103 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1104 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1105}
1106
1107// We need to handle this here because tablegen doesn't support matching
1108// instructions with multiple outputs.
1109void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1110 SDLoc SL(N);
1111 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1112 unsigned Opc;
1113 if (Subtarget->hasMADIntraFwdBug())
1114 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1115 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1116 else
1117 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1118
1119 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1120 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1121 Clamp };
1122 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1123}
1124
1125// We need to handle this here because tablegen doesn't support matching
1126// instructions with multiple outputs.
1127void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1128 SDLoc SL(N);
1129 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1130 unsigned Opc;
1131 if (Subtarget->hasMADIntraFwdBug())
1132 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1133 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1134 else
1135 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1136
1137 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1138 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1139 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1140 SDNode *Mad = CurDAG->getMachineNode(
1141 Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
1142 if (!SDValue(N, 0).use_empty()) {
1143 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1144 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1145 MVT::i32, SDValue(Mad, 0), Sub0);
1146 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1147 }
1148 if (!SDValue(N, 1).use_empty()) {
1149 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1150 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1151 MVT::i32, SDValue(Mad, 0), Sub1);
1152 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1153 }
1154 CurDAG->RemoveDeadNode(N);
1155}
1156
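// A DS immediate offset must fit in 16 unsigned bits; on targets without a
// usable DS offset the base must additionally be known non-negative.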
1157bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1158 if (!isUInt<16>(Offset))
1159 return false;
1160
1161 if (!Base || Subtarget->hasUsableDSOffset() ||
1162 Subtarget->unsafeDSOffsetFoldingEnabled())
1163 return true;
1164
1165 // On Southern Islands, instructions with a negative base value and an offset
1166 // don't seem to work.
1167 return CurDAG->SignBitIsZero(Base);
1168}
1169
1170bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1171 SDValue &Offset) const {
1172 SDLoc DL(Addr);
1173 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1174 SDValue N0 = Addr.getOperand(0);
1175 SDValue N1 = Addr.getOperand(1);
1176 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1177 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1178 // (add n0, c0)
1179 Base = N0;
1180 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1181 return true;
1182 }
1183 } else if (Addr.getOpcode() == ISD::SUB) {
1184 // sub C, x -> add (sub 0, x), C
1185 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1186 int64_t ByteOffset = C->getSExtValue();
1187 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1188 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1189
1190 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1191 // the known bits in isDSOffsetLegal. We need to emit the selected node
1192 // here, so this is thrown away.
1193 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1194 Zero, Addr.getOperand(1));
1195
1196 if (isDSOffsetLegal(Sub, ByteOffset)) {
1197 SmallVector<SDValue, 3> Opnds;
1198 Opnds.push_back(Zero);
1199 Opnds.push_back(Addr.getOperand(1));
1200
1201 // FIXME: Select to VOP3 version for with-carry.
1202 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1203 if (Subtarget->hasAddNoCarry()) {
1204 SubOp = AMDGPU::V_SUB_U32_e64;
1205 Opnds.push_back(
1206 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1207 }
1208
1209 MachineSDNode *MachineSub =
1210 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1211
1212 Base = SDValue(MachineSub, 0);
1213 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1214 return true;
1215 }
1216 }
1217 }
1218 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1219 // If we have a constant address, prefer to put the constant into the
1220 // offset. This can save moves to load the constant address since multiple
1221 // operations can share the zero base address register, and enables merging
1222 // into read2 / write2 instructions.
1223
1224 SDLoc DL(Addr);
1225
1226 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1227 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1228 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1229 DL, MVT::i32, Zero);
1230 Base = SDValue(MovZero, 0);
1231 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1232 return true;
1233 }
1234 }
1235
1236 // default case
1237 Base = Addr;
1238 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1239 return true;
1240}
1241
1242bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1243 unsigned Offset1,
1244 unsigned Size) const {
1245 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1246 return false;
1247 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1248 return false;
1249
1250 if (!Base || Subtarget->hasUsableDSOffset() ||
1251 Subtarget->unsafeDSOffsetFoldingEnabled())
1252 return true;
1253
1254 // On Southern Islands, instructions with a negative base value and an offset
1255 // don't seem to work.
1256 return CurDAG->SignBitIsZero(Base);
1257}
1258
1259// Return whether the operation has the NoUnsignedWrap property.
1260static bool isNoUnsignedWrap(SDValue Addr) {
1261 return (Addr.getOpcode() == ISD::ADD &&
1262 Addr->getFlags().hasNoUnsignedWrap()) ||
1263 Addr->getOpcode() == ISD::OR;
1264}
1265
1266// Check that the base address of flat scratch load/store in the form of `base +
1267// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1268// requirement). We always treat the first operand as the base address here.
1269bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1270 if (isNoUnsignedWrap(Addr))
1271 return true;
1272
1273 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1274 // values.
1275 if (Subtarget->hasSignedScratchOffsets())
1276 return true;
1277
1278 auto LHS = Addr.getOperand(0);
1279 auto RHS = Addr.getOperand(1);
1280
1281 // If the immediate offset is negative and within certain range, the base
1282 // address cannot also be negative. If the base is also negative, the sum
1283 // would be either negative or much larger than the valid range of scratch
1284 // memory a thread can access.
1285 ConstantSDNode *ImmOp = nullptr;
1286 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1287 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1288 return true;
1289 }
1290
1291 return CurDAG->SignBitIsZero(LHS);
1292}
1293
1294// Check address value in SGPR/VGPR are legal for flat scratch in the form
1295// of: SGPR + VGPR.
1296bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1297 if (isNoUnsignedWrap(Addr))
1298 return true;
1299
1300 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1301 // values.
1302 if (Subtarget->hasSignedScratchOffsets())
1303 return true;
1304
1305 auto LHS = Addr.getOperand(0);
1306 auto RHS = Addr.getOperand(1);
1307 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1308}
1309
1310// Check address value in SGPR/VGPR are legal for flat scratch in the form
1311// of: SGPR + VGPR + Imm.
1312bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1313 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1314 // values.
1315 if (AMDGPU::isGFX12Plus(*Subtarget))
1316 return true;
1317
1318 auto Base = Addr.getOperand(0);
1319 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1320 // If the immediate offset is negative and within certain range, the base
1321 // address cannot also be negative. If the base is also negative, the sum
1322 // would be either negative or much larger than the valid range of scratch
1323 // memory a thread can access.
1324 if (isNoUnsignedWrap(Base) &&
1325 (isNoUnsignedWrap(Addr) ||
1326 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1327 return true;
1328
1329 auto LHS = Base.getOperand(0);
1330 auto RHS = Base.getOperand(1);
1331 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1332}
1333
1334// TODO: If offset is too big, put low 16-bit into offset.
1335bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1336 SDValue &Offset0,
1337 SDValue &Offset1) const {
1338 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1339}
1340
1341bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1342 SDValue &Offset0,
1343 SDValue &Offset1) const {
1344 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1345}
1346
1347bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1348 SDValue &Offset0, SDValue &Offset1,
1349 unsigned Size) const {
1350 SDLoc DL(Addr);
1351
1352 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1353 SDValue N0 = Addr.getOperand(0);
1354 SDValue N1 = Addr.getOperand(1);
1355 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1356 unsigned OffsetValue0 = C1->getZExtValue();
1357 unsigned OffsetValue1 = OffsetValue0 + Size;
1358
1359 // (add n0, c0)
1360 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1361 Base = N0;
1362 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1363 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1364 return true;
1365 }
1366 } else if (Addr.getOpcode() == ISD::SUB) {
1367 // sub C, x -> add (sub 0, x), C
1368 if (const ConstantSDNode *C =
1369 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1370 unsigned OffsetValue0 = C->getZExtValue();
1371 unsigned OffsetValue1 = OffsetValue0 + Size;
1372
1373 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1374 SDLoc DL(Addr);
1375 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1376
1377 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1378 // the known bits in isDSOffsetLegal. We need to emit the selected node
1379 // here, so this is thrown away.
1380 SDValue Sub =
1381 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1382
1383 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1384 SmallVector<SDValue, 3> Opnds;
1385 Opnds.push_back(Zero);
1386 Opnds.push_back(Addr.getOperand(1));
1387 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1388 if (Subtarget->hasAddNoCarry()) {
1389 SubOp = AMDGPU::V_SUB_U32_e64;
1390 Opnds.push_back(
1391 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1392 }
1393
1394 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1395 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1396
1397 Base = SDValue(MachineSub, 0);
1398 Offset0 =
1399 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1400 Offset1 =
1401 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1402 return true;
1403 }
1404 }
1405 }
1406 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1407 unsigned OffsetValue0 = CAddr->getZExtValue();
1408 unsigned OffsetValue1 = OffsetValue0 + Size;
1409
1410 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1411 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1412 MachineSDNode *MovZero =
1413 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1414 Base = SDValue(MovZero, 0);
1415 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1416 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1417 return true;
1418 }
1419 }
1420
1421 // default case
1422
1423 Base = Addr;
1424 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1425 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1426 return true;
1427}
1428
1429bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1430 SDValue &SOffset, SDValue &Offset,
1431 SDValue &Offen, SDValue &Idxen,
1432 SDValue &Addr64) const {
1433 // Subtarget prefers to use flat instruction
1434 // FIXME: This should be a pattern predicate and not reach here
1435 if (Subtarget->useFlatForGlobal())
1436 return false;
1437
1438 SDLoc DL(Addr);
1439
1440 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1441 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1442 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1443 SOffset = Subtarget->hasRestrictedSOffset()
1444 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1445 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1446
1447 ConstantSDNode *C1 = nullptr;
1448 SDValue N0 = Addr;
1449 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1450 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1451 if (isUInt<32>(C1->getZExtValue()))
1452 N0 = Addr.getOperand(0);
1453 else
1454 C1 = nullptr;
1455 }
1456
1457 if (N0.getOpcode() == ISD::ADD) {
1458 // (add N2, N3) -> addr64, or
1459 // (add (add N2, N3), C1) -> addr64
1460 SDValue N2 = N0.getOperand(0);
1461 SDValue N3 = N0.getOperand(1);
1462 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1463
1464 if (N2->isDivergent()) {
1465 if (N3->isDivergent()) {
1466 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1467 // addr64, and construct the resource from a 0 address.
1468 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1469 VAddr = N0;
1470 } else {
1471 // N2 is divergent, N3 is not.
1472 Ptr = N3;
1473 VAddr = N2;
1474 }
1475 } else {
1476 // N2 is not divergent.
1477 Ptr = N2;
1478 VAddr = N3;
1479 }
1480 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1481 } else if (N0->isDivergent()) {
1482 // N0 is divergent. Use it as the addr64, and construct the resource from a
1483 // 0 address.
1484 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1485 VAddr = N0;
1486 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1487 } else {
1488 // N0 -> offset, or
1489 // (N0 + C1) -> offset
1490 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1491 Ptr = N0;
1492 }
1493
1494 if (!C1) {
1495 // No offset.
1496 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1497 return true;
1498 }
1499
1500 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1501 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1502 // Legal offset for instruction.
1503 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1504 return true;
1505 }
1506
1507 // Illegal offset, store it in soffset.
1508 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1509 SOffset =
1510 SDValue(CurDAG->getMachineNode(
1511 AMDGPU::S_MOV_B32, DL, MVT::i32,
1512 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1513 0);
1514 return true;
1515}
1516
1517bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1518 SDValue &VAddr, SDValue &SOffset,
1519 SDValue &Offset) const {
1520 SDValue Ptr, Offen, Idxen, Addr64;
1521
1522 // addr64 bit was removed for volcanic islands.
1523 // FIXME: This should be a pattern predicate and not reach here
1524 if (!Subtarget->hasAddr64())
1525 return false;
1526
1527 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1528 return false;
1529
1530 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1531 if (C->getSExtValue()) {
1532 SDLoc DL(Addr);
1533
1534 const SITargetLowering& Lowering =
1535 *static_cast<const SITargetLowering*>(getTargetLowering());
1536
1537 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1538 return true;
1539 }
1540
1541 return false;
1542}
1543
1544std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1545 SDLoc DL(N);
1546
1547 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1548 SDValue TFI =
1549 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1550
1551 // We rebase the base address into an absolute stack address and hence
1552 // use constant 0 for soffset. This value must be retained until
1553 // frame elimination and eliminateFrameIndex will choose the appropriate
1554 // frame register if need be.
1555 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1556}
1557
1558bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1559 SDValue Addr, SDValue &Rsrc,
1560 SDValue &VAddr, SDValue &SOffset,
1561 SDValue &ImmOffset) const {
1562
1563 SDLoc DL(Addr);
1566
1567 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1568
1569 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1570 int64_t Imm = CAddr->getSExtValue();
1571 const int64_t NullPtr =
1573 // Don't fold null pointer.
1574 if (Imm != NullPtr) {
1575 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1576 SDValue HighBits =
1577 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1578 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1579 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1580 VAddr = SDValue(MovHighBits, 0);
1581
1582 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1583 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1584 return true;
1585 }
1586 }
1587
1589 // (add n0, c1)
1590
1591 SDValue N0 = Addr.getOperand(0);
1592 uint64_t C1 = Addr.getConstantOperandVal(1);
1593
1594 // Offsets in vaddr must be positive if range checking is enabled.
1595 //
1596 // The total computation of vaddr + soffset + offset must not overflow. If
1597 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1598 // overflowing.
1599 //
1600 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1601 // always perform a range check. If a negative vaddr base index was used,
1602 // this would fail the range check. The overall address computation would
1603 // compute a valid address, but this doesn't happen due to the range
1604 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1605 //
1606 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1607 // MUBUF vaddr, but not on older subtargets which can only do this if the
1608 // sign bit is known 0.
1609 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1610 if (TII->isLegalMUBUFImmOffset(C1) &&
1611 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1612 CurDAG->SignBitIsZero(N0))) {
1613 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1614 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1615 return true;
1616 }
1617 }
1618
1619 // (node)
1620 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1621 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1622 return true;
1623}
1624
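// Returns true if Val is a CopyFromReg of a physical SGPR, i.e. a value that
// is uniform across the wave.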
1625static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1626 if (Val.getOpcode() != ISD::CopyFromReg)
1627 return false;
1628 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1629 if (!Reg.isPhysical())
1630 return false;
1631 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1632 return RC && TRI.isSGPRClass(RC);
1633}
1634
1635bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1636 SDValue Addr,
1637 SDValue &SRsrc,
1638 SDValue &SOffset,
1639 SDValue &Offset) const {
1640 const SIRegisterInfo *TRI =
1641 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1642 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1645 SDLoc DL(Addr);
1646
1647 // CopyFromReg <sgpr>
1648 if (IsCopyFromSGPR(*TRI, Addr)) {
1649 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1650 SOffset = Addr;
1651 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1652 return true;
1653 }
1654
1655 ConstantSDNode *CAddr;
1656 if (Addr.getOpcode() == ISD::ADD) {
1657 // Add (CopyFromReg <sgpr>) <constant>
1658 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1659 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1660 return false;
1661 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1662 return false;
1663
1664 SOffset = Addr.getOperand(0);
1665 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1666 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1667 // <constant>
1668 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1669 } else {
1670 return false;
1671 }
1672
1673 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1674
1675 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1676 return true;
1677}
1678
1679bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1680 SDValue &SOffset, SDValue &Offset
1681 ) const {
1682 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1683 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1684
1685 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1686 return false;
1687
1688 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1689 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1690 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1691 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1692 maskTrailingOnes<uint64_t>(32); // Size
1693 SDLoc DL(Addr);
1694
1695 const SITargetLowering& Lowering =
1696 *static_cast<const SITargetLowering*>(getTargetLowering());
1697
1698 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1699 return true;
1700 }
1701 return false;
1702}
1703
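// On subtargets with a restricted SOffset field, fold a known-zero byte
// offset to the SGPR_NULL register instead of materializing a 0.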
1704bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1705 SDValue &SOffset) const {
1706 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1707 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1708 return true;
1709 }
1710
1711 SOffset = ByteOffsetNode;
1712 return true;
1713}
1714
1715// Find a load or store from corresponding pattern root.
1716// Roots may be build_vector, bitconvert or their combinations.
1717static MemSDNode* findMemSDNode(SDNode *N) {
1718 N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
1719 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1720 return MN;
1721 assert(isa<BuildVectorSDNode>(N));
1722 for (SDValue V : N->op_values())
1723 if (MemSDNode *MN =
1724 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1725 return MN;
1726 llvm_unreachable("cannot find MemSDNode in the pattern!");
1727}
1728
1729bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1730 SDValue &VAddr, SDValue &Offset,
1731 uint64_t FlatVariant) const {
1732 int64_t OffsetVal = 0;
1733
1734 unsigned AS = findMemSDNode(N)->getAddressSpace();
1735
1736 bool CanHaveFlatSegmentOffsetBug =
1737 Subtarget->hasFlatSegmentOffsetBug() &&
1738 FlatVariant == SIInstrFlags::FLAT &&
1740
1741 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1742 SDValue N0, N1;
1743 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1744 (FlatVariant != SIInstrFlags::FlatScratch ||
1745 isFlatScratchBaseLegal(Addr))) {
1746 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1747
1748 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1749 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1750 Addr = N0;
1751 OffsetVal = COffsetVal;
1752 } else {
1753 // If the offset doesn't fit, put the low bits into the offset field and
1754 // add the rest.
1755 //
1756 // For a FLAT instruction the hardware decides whether to access
1757 // global/scratch/shared memory based on the high bits of vaddr,
1758 // ignoring the offset field, so we have to ensure that when we add
1759 // remainder to vaddr it still points into the same underlying object.
1760 // The easiest way to do that is to make sure that we split the offset
1761 // into two pieces that are both >= 0 or both <= 0.
1762
1763 SDLoc DL(N);
1764 uint64_t RemainderOffset;
1765
1766 std::tie(OffsetVal, RemainderOffset) =
1767 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1768
1769 SDValue AddOffsetLo =
1770 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1771 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1772
1773 if (Addr.getValueType().getSizeInBits() == 32) {
1774 SmallVector<SDValue, 3> Opnds;
1775 Opnds.push_back(N0);
1776 Opnds.push_back(AddOffsetLo);
1777 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1778 if (Subtarget->hasAddNoCarry()) {
1779 AddOp = AMDGPU::V_ADD_U32_e64;
1780 Opnds.push_back(Clamp);
1781 }
1782 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1783 } else {
1784 // TODO: Should this try to use a scalar add pseudo if the base address
1785 // is uniform and saddr is usable?
1786 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1787 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1788
1789 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1790 DL, MVT::i32, N0, Sub0);
1791 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1792 DL, MVT::i32, N0, Sub1);
1793
1794 SDValue AddOffsetHi =
1795 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1796
1797 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1798
1799 SDNode *Add =
1800 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1801 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1802
1803 SDNode *Addc = CurDAG->getMachineNode(
1804 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1805 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1806
1807 SDValue RegSequenceArgs[] = {
1808 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1809 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1810
1811 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1812 MVT::i64, RegSequenceArgs),
1813 0);
1814 }
1815 }
1816 }
1817 }
1818
1819 VAddr = Addr;
1820 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1821 return true;
1822}
1823
1824bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1825 SDValue &VAddr,
1826 SDValue &Offset) const {
1827 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1828}
1829
1830bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1831 SDValue &VAddr,
1832 SDValue &Offset) const {
1833 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1834}
1835
1836bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1837 SDValue &VAddr,
1838 SDValue &Offset) const {
1839 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1840 SIInstrFlags::FlatScratch);
1841}
1842
1843// If this matches zero_extend i32:x, return x
1844static SDValue matchZExtFromI32(SDValue Op) {
1845 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1846 return SDValue();
1847
1848 SDValue ExtSrc = Op.getOperand(0);
1849 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1850}
1851
1852// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1853bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1854 SDValue Addr,
1855 SDValue &SAddr,
1856 SDValue &VOffset,
1857 SDValue &Offset) const {
1858 int64_t ImmOffset = 0;
1859
1860 // Match the immediate offset first, which canonically is moved as low as
1861 // possible.
1862
1863 SDValue LHS, RHS;
1864 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1865 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1866 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1867
1868 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1869 SIInstrFlags::FlatGlobal)) {
1870 Addr = LHS;
1871 ImmOffset = COffsetVal;
1872 } else if (!LHS->isDivergent()) {
1873 if (COffsetVal > 0) {
1874 SDLoc SL(N);
1875 // saddr + large_offset -> saddr +
1876 // (voffset = large_offset & ~MaxOffset) +
1877 // (large_offset & MaxOffset);
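 // For example (illustrative, assuming MaxOffset == 4095): a large_offset of
 // 0x11234 becomes voffset = 0x11000, materialized in a VGPR, plus an
 // immediate offset of 0x234.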
1878 int64_t SplitImmOffset, RemainderOffset;
1879 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1880 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1881
1882 if (isUInt<32>(RemainderOffset)) {
1883 SDNode *VMov = CurDAG->getMachineNode(
1884 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1885 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1886 VOffset = SDValue(VMov, 0);
1887 SAddr = LHS;
1888 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1889 return true;
1890 }
1891 }
1892
1893 // We are adding a 64 bit SGPR and a constant. If constant bus limit
1894 // is 1 we would need to perform 1 or 2 extra moves for each half of
1895 // the constant and it is better to do a scalar add and then issue a
1896 // single VALU instruction to materialize zero. Otherwise it is less
1897 // instructions to perform VALU adds with immediates or inline literals.
1898 unsigned NumLiterals =
1899 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1900 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1901 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1902 return false;
1903 }
1904 }
1905
1906 // Match the variable offset.
1907 if (Addr.getOpcode() == ISD::ADD) {
1908 LHS = Addr.getOperand(0);
1909 RHS = Addr.getOperand(1);
1910
1911 if (!LHS->isDivergent()) {
1912 // add (i64 sgpr), (zero_extend (i32 vgpr))
1913 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1914 SAddr = LHS;
1915 VOffset = ZextRHS;
1916 }
1917 }
1918
1919 if (!SAddr && !RHS->isDivergent()) {
1920 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1921 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1922 SAddr = RHS;
1923 VOffset = ZextLHS;
1924 }
1925 }
1926
1927 if (SAddr) {
1928 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1929 return true;
1930 }
1931 }
1932
1933 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1934 isa<ConstantSDNode>(Addr))
1935 return false;
1936
1937 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1938 // moves required to copy a 64-bit SGPR to VGPR.
1939 SAddr = Addr;
1940 SDNode *VMov =
1941 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1942 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1943 VOffset = SDValue(VMov, 0);
1944 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1945 return true;
1946}
1947
1948static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1949 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1950 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1951 } else if (SAddr.getOpcode() == ISD::ADD &&
1952 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1953 // Materialize this into a scalar move for scalar address to avoid
1954 // readfirstlane.
1955 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1956 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1957 FI->getValueType(0));
1958 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1959 MVT::i32, TFI, SAddr.getOperand(1)),
1960 0);
1961 }
1962
1963 return SAddr;
1964}
1965
1966// Match (32-bit SGPR base) + sext(imm offset)
1967bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1968 SDValue &SAddr,
1969 SDValue &Offset) const {
1970 if (Addr->isDivergent())
1971 return false;
1972
1973 SDLoc DL(Addr);
1974
1975 int64_t COffsetVal = 0;
1976
1977 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1978 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1979 SAddr = Addr.getOperand(0);
1980 } else {
1981 SAddr = Addr;
1982 }
1983
1984 SAddr = SelectSAddrFI(CurDAG, SAddr);
1985
1986 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1987
1988 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1989 SIInstrFlags::FlatScratch)) {
1990 int64_t SplitImmOffset, RemainderOffset;
1991 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1992 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1993
1994 COffsetVal = SplitImmOffset;
1995
1996 SDValue AddOffset =
1997 SAddr.getOpcode() == ISD::TargetFrameIndex
1998 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1999 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2000 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2001 SAddr, AddOffset),
2002 0);
2003 }
2004
2005 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2006
2007 return true;
2008}
2009
2010// Check whether the flat scratch SVS swizzle bug affects this access.
2011bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2012 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2013 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2014 return false;
2015
2016 // The bug affects the swizzling of SVS accesses if there is any carry out
2017 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2018 // voffset to (soffset + inst_offset).
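 // For example, if the low two bits of voffset can be 3 and the low two bits
 // of (soffset + inst_offset) can be 1, their sum can reach 4 and carry into
 // bit 2, so the SVS form must be rejected.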
2019 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2020 KnownBits SKnown =
2021 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2022 KnownBits::makeConstant(APInt(32, ImmOffset,
2023 /*isSigned=*/true)));
2024 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2025 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2026 return (VMax & 3) + (SMax & 3) >= 4;
2027}
2028
2029bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2030 SDValue &VAddr, SDValue &SAddr,
2031 SDValue &Offset) const {
2032 int64_t ImmOffset = 0;
2033
2034 SDValue LHS, RHS;
2035 SDValue OrigAddr = Addr;
2036 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2037 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2038 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2039
2040 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
2041 Addr = LHS;
2042 ImmOffset = COffsetVal;
2043 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2044 SDLoc SL(N);
2045 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2046 // (large_offset & MaxOffset);
2047 int64_t SplitImmOffset, RemainderOffset;
2048 std::tie(SplitImmOffset, RemainderOffset)
2049 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
2050
2051 if (isUInt<32>(RemainderOffset)) {
2052 SDNode *VMov = CurDAG->getMachineNode(
2053 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2054 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2055 VAddr = SDValue(VMov, 0);
2056 SAddr = LHS;
2057 if (!isFlatScratchBaseLegal(Addr))
2058 return false;
2059 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2060 return false;
2061 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2062 return true;
2063 }
2064 }
2065 }
2066
2067 if (Addr.getOpcode() != ISD::ADD)
2068 return false;
2069
2070 LHS = Addr.getOperand(0);
2071 RHS = Addr.getOperand(1);
2072
2073 if (!LHS->isDivergent() && RHS->isDivergent()) {
2074 SAddr = LHS;
2075 VAddr = RHS;
2076 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2077 SAddr = RHS;
2078 VAddr = LHS;
2079 } else {
2080 return false;
2081 }
2082
2083 if (OrigAddr != Addr) {
2084 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2085 return false;
2086 } else {
2087 if (!isFlatScratchBaseLegalSV(OrigAddr))
2088 return false;
2089 }
2090
2091 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2092 return false;
2093 SAddr = SelectSAddrFI(CurDAG, SAddr);
2094 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2095 return true;
2096}
2097
2098// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2099// negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2100// Handle the case where the Immediate Offset + SOffset is negative.
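// For example, with ImmOffset == -8 and an SOffset whose known minimum value
// is 4, the sum could still be -4, so the combination is rejected below.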
2101bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2102 bool Imm32Only,
2103 bool IsBuffer,
2104 int64_t ImmOffset) const {
2105 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2106 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2107 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2108 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2109 return false;
2110 }
2111
2112 return true;
2113}
2114
2115// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2116// not null) offset. If Imm32Only is true, match only 32-bit immediate
2117// offsets available on CI.
2118bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2119 SDValue *SOffset, SDValue *Offset,
2120 bool Imm32Only, bool IsBuffer,
2121 bool HasSOffset,
2122 int64_t ImmOffset) const {
2123 assert((!SOffset || !Offset) &&
2124 "Cannot match both soffset and offset at the same time!");
2125
2126 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2127 if (!C) {
2128 if (!SOffset)
2129 return false;
2130
2131 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2132 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2133 *SOffset = ByteOffsetNode;
2134 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2135 ImmOffset);
2136 }
2137 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2138 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2139 *SOffset = ByteOffsetNode.getOperand(0);
2140 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2141 ImmOffset);
2142 }
2143 }
2144 return false;
2145 }
2146
2147 SDLoc SL(ByteOffsetNode);
2148
2149 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2150 // offset for S_BUFFER instructions is unsigned.
2151 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2152 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2153 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2154 if (EncodedOffset && Offset && !Imm32Only) {
2155 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2156 return true;
2157 }
2158
2159 // SGPR and literal offsets are unsigned.
2160 if (ByteOffset < 0)
2161 return false;
2162
2163 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2164 if (EncodedOffset && Offset && Imm32Only) {
2165 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2166 return true;
2167 }
2168
2169 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2170 return false;
2171
2172 if (SOffset) {
2173 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2174 *SOffset = SDValue(
2175 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2176 return true;
2177 }
2178
2179 return false;
2180}
2181
2182SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2183 if (Addr.getValueType() != MVT::i32)
2184 return Addr;
2185
2186 // Zero-extend a 32-bit address.
2187 SDLoc SL(Addr);
2188
2189 const MachineFunction &MF = CurDAG->getMachineFunction();
2190 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2191 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2192 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2193
2194 const SDValue Ops[] = {
2195 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2196 Addr,
2197 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2198 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2199 0),
2200 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2201 };
2202
2203 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2204 Ops), 0);
2205}
2206
2207// Match a base and an immediate (if Offset is not null) or an SGPR (if
2208// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2209// true, match only 32-bit immediate offsets available on CI.
2210bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2211 SDValue *SOffset, SDValue *Offset,
2212 bool Imm32Only, bool IsBuffer,
2213 bool HasSOffset,
2214 int64_t ImmOffset) const {
2215 if (SOffset && Offset) {
2216 assert(!Imm32Only && !IsBuffer);
2217 SDValue B;
2218
2219 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2220 return false;
2221
2222 int64_t ImmOff = 0;
2223 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2224 ImmOff = C->getSExtValue();
2225
2226 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2227 ImmOff);
2228 }
2229
2230 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2231 // wraparound, because s_load instructions perform the addition in 64 bits.
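 // For example, a 32-bit add of 0xFFFFFFF0 + 0x20 wraps to 0x10, but the
 // hardware would compute 0x1'0000'0010, so the fold is only safe when the
 // add carries the nuw flag.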
2232 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2233 !Addr->getFlags().hasNoUnsignedWrap())
2234 return false;
2235
2236 SDValue N0, N1;
2237 // Extract the base and offset if possible.
2238 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2239 N0 = Addr.getOperand(0);
2240 N1 = Addr.getOperand(1);
2241 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2242 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2243 }
2244 if (!N0 || !N1)
2245 return false;
2246
2247 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2248 ImmOffset)) {
2249 SBase = N0;
2250 return true;
2251 }
2252 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2253 ImmOffset)) {
2254 SBase = N1;
2255 return true;
2256 }
2257 return false;
2258}
2259
2260bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2261 SDValue *SOffset, SDValue *Offset,
2262 bool Imm32Only) const {
2263 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2264 SBase = Expand32BitAddress(SBase);
2265 return true;
2266 }
2267
2268 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2269 SBase = Expand32BitAddress(Addr);
2270 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2271 return true;
2272 }
2273
2274 return false;
2275}
2276
2277bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2278 SDValue &Offset) const {
2279 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2280}
2281
2282bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2283 SDValue &Offset) const {
2284 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2285 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2286 /* Imm32Only */ true);
2287}
2288
2289bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2290 SDValue &SOffset) const {
2291 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2292}
2293
2294bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2295 SDValue &SOffset,
2296 SDValue &Offset) const {
2297 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2298}
2299
2300bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2301 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2302 /* Imm32Only */ false, /* IsBuffer */ true);
2303}
2304
2305bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2306 SDValue &Offset) const {
2307 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2308 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2309 /* Imm32Only */ true, /* IsBuffer */ true);
2310}
2311
2312bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2313 SDValue &Offset) const {
2314 // Match the (soffset + offset) pair as a 32-bit register base and
2315 // an immediate offset.
2316 return N.getValueType() == MVT::i32 &&
2317 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2318 &Offset, /* Imm32Only */ false,
2319 /* IsBuffer */ true);
2320}
2321
2322bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2323 SDValue &Base,
2324 SDValue &Offset) const {
2325 SDLoc DL(Index);
2326
2327 if (CurDAG->isBaseWithConstantOffset(Index)) {
2328 SDValue N0 = Index.getOperand(0);
2329 SDValue N1 = Index.getOperand(1);
2330 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2331
2332 // (add n0, c0)
2333 // Don't peel off the offset (c0) if doing so could possibly lead
2334 // the base (n0) to be negative.
2335 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
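 // For example, (add n0, 16) only peels the 16 when n0 is known non-negative,
 // while (or n0, 16) is always safe to peel because the OR cannot carry into
 // the sign bit.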
2336 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2337 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2338 Base = N0;
2339 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2340 return true;
2341 }
2342 }
2343
2344 if (isa<ConstantSDNode>(Index))
2345 return false;
2346
2347 Base = Index;
2348 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2349 return true;
2350}
2351
2352SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2353 SDValue Val, uint32_t Offset,
2354 uint32_t Width) {
2355 if (Val->isDivergent()) {
2356 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2357 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2358 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2359
2360 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2361 }
2362 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2363 // Transformation function, pack the offset and width of a BFE into
2364 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2365 // source, bits [5:0] contain the offset and bits [22:16] the width.
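 // For example, Offset = 8 and Width = 5 are packed as 8 | (5 << 16) ==
 // 0x00050008.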
2366 uint32_t PackedVal = Offset | (Width << 16);
2367 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2368
2369 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2370}
2371
2372void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2373 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2374 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2375 // Predicate: 0 < b <= c < 32
2376
2377 const SDValue &Shl = N->getOperand(0);
2378 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2379 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2380
2381 if (B && C) {
2382 uint32_t BVal = B->getZExtValue();
2383 uint32_t CVal = C->getZExtValue();
2384
2385 if (0 < BVal && BVal <= CVal && CVal < 32) {
2386 bool Signed = N->getOpcode() == ISD::SRA;
2387 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2388 32 - CVal));
2389 return;
2390 }
2391 }
2392 SelectCode(N);
2393}
2394
2395void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2396 switch (N->getOpcode()) {
2397 case ISD::AND:
2398 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2399 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2400 // Predicate: isMask(mask)
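 // For example, ((a srl 8) & 0xff) becomes BFE_U32 a, 8, 8, since
 // popcount(0xff) == 8.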
2401 const SDValue &Srl = N->getOperand(0);
2402 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2403 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2404
2405 if (Shift && Mask) {
2406 uint32_t ShiftVal = Shift->getZExtValue();
2407 uint32_t MaskVal = Mask->getZExtValue();
2408
2409 if (isMask_32(MaskVal)) {
2410 uint32_t WidthVal = llvm::popcount(MaskVal);
2411 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2412 WidthVal));
2413 return;
2414 }
2415 }
2416 }
2417 break;
2418 case ISD::SRL:
2419 if (N->getOperand(0).getOpcode() == ISD::AND) {
2420 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2421 // Predicate: isMask(mask >> b)
2422 const SDValue &And = N->getOperand(0);
2423 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2424 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2425
2426 if (Shift && Mask) {
2427 uint32_t ShiftVal = Shift->getZExtValue();
2428 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2429
2430 if (isMask_32(MaskVal)) {
2431 uint32_t WidthVal = llvm::popcount(MaskVal);
2432 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2433 WidthVal));
2434 return;
2435 }
2436 }
2437 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2438 SelectS_BFEFromShifts(N);
2439 return;
2440 }
2441 break;
2442 case ISD::SRA:
2443 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2444 SelectS_BFEFromShifts(N);
2445 return;
2446 }
2447 break;
2448
2449 case ISD::SIGN_EXTEND_INREG: {
2450 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2451 SDValue Src = N->getOperand(0);
2452 if (Src.getOpcode() != ISD::SRL)
2453 break;
2454
2455 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2456 if (!Amt)
2457 break;
2458
2459 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2460 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2461 Amt->getZExtValue(), Width));
2462 return;
2463 }
2464 }
2465
2466 SelectCode(N);
2467}
2468
2469bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2470 assert(N->getOpcode() == ISD::BRCOND);
2471 if (!N->hasOneUse())
2472 return false;
2473
2474 SDValue Cond = N->getOperand(1);
2475 if (Cond.getOpcode() == ISD::CopyToReg)
2476 Cond = Cond.getOperand(2);
2477
2478 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2479 return false;
2480
2481 MVT VT = Cond.getOperand(0).getSimpleValueType();
2482 if (VT == MVT::i32)
2483 return true;
2484
2485 if (VT == MVT::i64) {
2486 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2487 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2488 Subtarget->hasScalarCompareEq64();
2489 }
2490
2491 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2492 return true;
2493
2494 return false;
2495}
2496
2497static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2498 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2499 // Special case for amdgcn.ballot:
2500 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2501 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2502 // =>
2503 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2504 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2505 // Cond becomes a i(WaveSize) full mask value.
2506 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2507 // here for completeness, so in this case Negate is set to true on return.
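 // For example, (AMDGPUISD::SETCC (zext i1 %Cond), 0, setne) is replaced by
 // %Cond itself, while the seteq form returns %Cond with Negate set to true.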
2508 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2509 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2510 isNullConstant(VCMP.getOperand(1))) {
2511
2512 auto Cond = VCMP.getOperand(0);
2513 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2514 Cond = Cond.getOperand(0);
2515
2516 if (isBoolSGPR(Cond)) {
2517 Negate = VCMP_CC == ISD::SETEQ;
2518 return Cond;
2519 }
2520 }
2521 return SDValue();
2522}
2523
2524void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2525 SDValue Cond = N->getOperand(1);
2526
2527 if (Cond.isUndef()) {
2528 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2529 N->getOperand(2), N->getOperand(0));
2530 return;
2531 }
2532
2533 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2534
2535 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2536 bool AndExec = !UseSCCBr;
2537 bool Negate = false;
2538
2539 if (Cond.getOpcode() == ISD::SETCC &&
2540 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2541 SDValue VCMP = Cond->getOperand(0);
2542 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2543 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2544 isNullConstant(Cond->getOperand(1)) &&
2545 // We may encounter ballot.i64 in wave32 mode on -O0.
2546 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2547 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2548 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2549 // BRCOND i1 %C, %BB
2550 // =>
2551 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2552 // VCC = COPY i(WaveSize) %VCMP
2553 // S_CBRANCH_VCCNZ/VCCZ %BB
2554 Negate = CC == ISD::SETEQ;
2555 bool NegatedBallot = false;
2556 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2557 Cond = BallotCond;
2558 UseSCCBr = !BallotCond->isDivergent();
2559 Negate = Negate ^ NegatedBallot;
2560 } else {
2561 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2562 // selected as V_CMP, but this may change for uniform condition.
2563 Cond = VCMP;
2564 UseSCCBr = false;
2565 }
2566 }
2567 // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
2568 // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
2569 // used.
2570 AndExec = false;
2571 }
2572
2573 unsigned BrOp =
2574 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2575 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2576 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2577 SDLoc SL(N);
2578
2579 if (AndExec) {
2580 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2581 // analyzed what generates the vcc value, so we do not know whether vcc
2582 // bits for disabled lanes are 0. Thus we need to mask out bits for
2583 // disabled lanes.
2584 //
2585 // For the case that we select S_CBRANCH_SCC1 and it gets
2586 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2587 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2588 //
2589 // We could add an analysis of what generates the vcc value here and omit
2590 // the S_AND when it is unnecessary. But it would be better to add a separate
2591 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2592 // catches both cases.
2593 Cond = SDValue(
2594 CurDAG->getMachineNode(
2595 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2596 MVT::i1,
2597 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2598 : AMDGPU::EXEC,
2599 MVT::i1),
2600 Cond),
2601 0);
2602 }
2603
2604 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2605 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2606 N->getOperand(2), // Basic Block
2607 VCC.getValue(0));
2608}
2609
2610void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2611 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2612 !N->isDivergent()) {
2613 SDValue Src = N->getOperand(0);
2614 if (Src.getValueType() == MVT::f16) {
2615 if (isExtractHiElt(Src, Src)) {
2616 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2617 {Src});
2618 return;
2619 }
2620 }
2621 }
2622
2623 SelectCode(N);
2624}
2625
2626void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2627 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2628 // be copied to an SGPR with readfirstlane.
2629 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2630 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2631
2632 SDValue Chain = N->getOperand(0);
2633 SDValue Ptr = N->getOperand(2);
2634 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2635 MachineMemOperand *MMO = M->getMemOperand();
2636 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2637
2638 SDValue Offset;
2639 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2640 SDValue PtrBase = Ptr.getOperand(0);
2641 SDValue PtrOffset = Ptr.getOperand(1);
2642
2643 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2644 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2645 N = glueCopyToM0(N, PtrBase);
2646 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2647 }
2648 }
2649
2650 if (!Offset) {
2651 N = glueCopyToM0(N, Ptr);
2652 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2653 }
2654
2655 SDValue Ops[] = {
2656 Offset,
2657 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2658 Chain,
2659 N->getOperand(N->getNumOperands() - 1) // New glue
2660 };
2661
2662 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2663 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2664}
2665
2666// We need to handle this here because tablegen doesn't support matching
2667// instructions with multiple outputs.
2668void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2669 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2670 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2671 N->getOperand(5), N->getOperand(0)};
2672
2673 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2674 MachineMemOperand *MMO = M->getMemOperand();
2675 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2676 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2677}
2678
2679static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2680 switch (IntrID) {
2681 case Intrinsic::amdgcn_ds_gws_init:
2682 return AMDGPU::DS_GWS_INIT;
2683 case Intrinsic::amdgcn_ds_gws_barrier:
2684 return AMDGPU::DS_GWS_BARRIER;
2685 case Intrinsic::amdgcn_ds_gws_sema_v:
2686 return AMDGPU::DS_GWS_SEMA_V;
2687 case Intrinsic::amdgcn_ds_gws_sema_br:
2688 return AMDGPU::DS_GWS_SEMA_BR;
2689 case Intrinsic::amdgcn_ds_gws_sema_p:
2690 return AMDGPU::DS_GWS_SEMA_P;
2691 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2692 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2693 default:
2694 llvm_unreachable("not a gws intrinsic");
2695 }
2696}
2697
2698void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2699 if (!Subtarget->hasGWS() ||
2700 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2701 !Subtarget->hasGWSSemaReleaseAll())) {
2702 // Let this error.
2703 SelectCode(N);
2704 return;
2705 }
2706
2707 // Chain, intrinsic ID, vsrc, offset
2708 const bool HasVSrc = N->getNumOperands() == 4;
2709 assert(HasVSrc || N->getNumOperands() == 3);
2710
2711 SDLoc SL(N);
2712 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2713 int ImmOffset = 0;
2714 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2715 MachineMemOperand *MMO = M->getMemOperand();
2716
2717 // Don't worry if the offset ends up in a VGPR. Only one lane will have
2718 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2719
2720 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2721 // offset field) % 64. Some versions of the programming guide omit the m0
2722 // part, or claim it's from offset 0.
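 // For example, a non-constant base offset of 3 is shifted into M0 as
 // 3 << 16 == 0x30000, i.e. it lands in M0[21:16].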
2723 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2724 // If we have a constant offset, try to use the 0 in m0 as the base.
2725 // TODO: Look into changing the default m0 initialization value. If the
2726 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2727 // the immediate offset.
2728 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2729 ImmOffset = ConstOffset->getZExtValue();
2730 } else {
2731 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2732 ImmOffset = BaseOffset.getConstantOperandVal(1);
2733 BaseOffset = BaseOffset.getOperand(0);
2734 }
2735
2736 // Prefer to do the shift in an SGPR since it should be possible to use m0
2737 // as the result directly. If it's already an SGPR, it will be eliminated
2738 // later.
2739 SDNode *SGPROffset
2740 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2741 BaseOffset);
2742 // Shift to offset in m0
2743 SDNode *M0Base
2744 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2745 SDValue(SGPROffset, 0),
2746 CurDAG->getTargetConstant(16, SL, MVT::i32));
2747 glueCopyToM0(N, SDValue(M0Base, 0));
2748 }
2749
2750 SDValue Chain = N->getOperand(0);
2751 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2752
2753 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2754 SmallVector<SDValue, 5> Ops;
2755 if (HasVSrc)
2756 Ops.push_back(N->getOperand(2));
2757 Ops.push_back(OffsetField);
2758 Ops.push_back(Chain);
2759
2760 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2761 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2762}
2763
2764void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2765 if (Subtarget->getLDSBankCount() != 16) {
2766 // This is a single instruction with a pattern.
2767 SelectCode(N);
2768 return;
2769 }
2770
2771 SDLoc DL(N);
2772
2773 // This requires 2 instructions. It is possible to write a pattern to support
2774 // this, but the generated isel emitter doesn't correctly deal with multiple
2775 // output instructions using the same physical register input. The copy to m0
2776 // is incorrectly placed before the second instruction.
2777 //
2778 // TODO: Match source modifiers.
2779 //
2780 // def : Pat <
2781 // (int_amdgcn_interp_p1_f16
2782 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2783 // (i32 timm:$attrchan), (i32 timm:$attr),
2784 // (i1 timm:$high), M0),
2785 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2786 // timm:$attrchan, 0,
2787 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2788 // let Predicates = [has16BankLDS];
2789 // }
2790
2791 // 16 bank LDS
2792 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2793 N->getOperand(5), SDValue());
2794
2795 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2796
2797 SDNode *InterpMov =
2798 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2799 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2800 N->getOperand(3), // Attr
2801 N->getOperand(2), // Attrchan
2802 ToM0.getValue(1) // In glue
2803 });
2804
2805 SDNode *InterpP1LV =
2806 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2807 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2808 N->getOperand(1), // Src0
2809 N->getOperand(3), // Attr
2810 N->getOperand(2), // Attrchan
2811 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2812 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2813 N->getOperand(4), // high
2814 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2815 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2816 SDValue(InterpMov, 1)
2817 });
2818
2819 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2820}
2821
2822void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2823 unsigned IntrID = N->getConstantOperandVal(1);
2824 switch (IntrID) {
2825 case Intrinsic::amdgcn_ds_append:
2826 case Intrinsic::amdgcn_ds_consume: {
2827 if (N->getValueType(0) != MVT::i32)
2828 break;
2829 SelectDSAppendConsume(N, IntrID);
2830 return;
2831 }
2832 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2833 SelectDSBvhStackIntrinsic(N);
2834 return;
2835 case Intrinsic::amdgcn_init_whole_wave:
2836 CurDAG->getMachineFunction()
2837 .getInfo<SIMachineFunctionInfo>()
2838 ->setInitWholeWave();
2839 break;
2840 }
2841
2842 SelectCode(N);
2843}
2844
2845void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2846 unsigned IntrID = N->getConstantOperandVal(0);
2847 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2848 SDNode *ConvGlueNode = N->getGluedNode();
2849 if (ConvGlueNode) {
2850 // FIXME: Possibly iterate over multiple glue nodes?
2851 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2852 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2853 ConvGlueNode =
2854 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2855 MVT::Glue, SDValue(ConvGlueNode, 0));
2856 } else {
2857 ConvGlueNode = nullptr;
2858 }
2859 switch (IntrID) {
2860 case Intrinsic::amdgcn_wqm:
2861 Opcode = AMDGPU::WQM;
2862 break;
2863 case Intrinsic::amdgcn_softwqm:
2864 Opcode = AMDGPU::SOFT_WQM;
2865 break;
2866 case Intrinsic::amdgcn_wwm:
2867 case Intrinsic::amdgcn_strict_wwm:
2868 Opcode = AMDGPU::STRICT_WWM;
2869 break;
2870 case Intrinsic::amdgcn_strict_wqm:
2871 Opcode = AMDGPU::STRICT_WQM;
2872 break;
2873 case Intrinsic::amdgcn_interp_p1_f16:
2874 SelectInterpP1F16(N);
2875 return;
2876 case Intrinsic::amdgcn_permlane16_swap:
2877 case Intrinsic::amdgcn_permlane32_swap: {
2878 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
2879 !Subtarget->hasPermlane16Swap()) ||
2880 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
2881 !Subtarget->hasPermlane32Swap())) {
2882 SelectCode(N); // Hit the default error
2883 return;
2884 }
2885
2886 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
2887 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
2888 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
2889
2890 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
2891 if (ConvGlueNode)
2892 NewOps.push_back(SDValue(ConvGlueNode, 0));
2893
2894 bool FI = N->getConstantOperandVal(3);
2895 NewOps[2] = CurDAG->getTargetConstant(
2896 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(N), MVT::i32);
2897
2898 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
2899 return;
2900 }
2901 default:
2902 SelectCode(N);
2903 break;
2904 }
2905
2906 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2907 SDValue Src = N->getOperand(1);
2908 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2909 }
2910
2911 if (ConvGlueNode) {
2912 SmallVector<SDValue, 4> NewOps(N->ops());
2913 NewOps.push_back(SDValue(ConvGlueNode, 0));
2914 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2915 }
2916}
2917
2918void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2919 unsigned IntrID = N->getConstantOperandVal(1);
2920 switch (IntrID) {
2921 case Intrinsic::amdgcn_ds_gws_init:
2922 case Intrinsic::amdgcn_ds_gws_barrier:
2923 case Intrinsic::amdgcn_ds_gws_sema_v:
2924 case Intrinsic::amdgcn_ds_gws_sema_br:
2925 case Intrinsic::amdgcn_ds_gws_sema_p:
2926 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2927 SelectDS_GWS(N, IntrID);
2928 return;
2929 default:
2930 break;
2931 }
2932
2933 SelectCode(N);
2934}
2935
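// Select AMDGPUISD::WAVE_ADDRESS as a right shift by log2(wave size); for
// example, on a wave64 target (log2 == 6) an operand value of 0x1000 becomes
// S_LSHR_B32 0x1000, 6 == 0x40.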
2936void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2937 SDValue Log2WaveSize =
2938 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2939 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2940 {N->getOperand(0), Log2WaveSize});
2941}
2942
2943void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2944 SDValue SrcVal = N->getOperand(1);
2945 if (SrcVal.getValueType() != MVT::i32) {
2946 SelectCode(N); // Emit default error
2947 return;
2948 }
2949
2950 SDValue CopyVal;
2951 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2952 SDLoc SL(N);
2953
2954 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2955 CopyVal = SrcVal.getOperand(0);
2956 } else {
2957 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2958 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2959
2960 if (N->isDivergent()) {
2961 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2962 MVT::i32, SrcVal),
2963 0);
2964 }
2965
2966 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2967 {SrcVal, Log2WaveSize}),
2968 0);
2969 }
2970
2971 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2972 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2973}
2974
2975bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2976 unsigned &Mods,
2977 bool IsCanonicalizing,
2978 bool AllowAbs) const {
2979 Mods = SISrcMods::NONE;
2980 Src = In;
2981
2982 if (Src.getOpcode() == ISD::FNEG) {
2983 Mods |= SISrcMods::NEG;
2984 Src = Src.getOperand(0);
2985 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2986 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2987 // denormal mode, but we're implicitly canonicalizing in a source operand.
2988 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2989 if (LHS && LHS->isZero()) {
2990 Mods |= SISrcMods::NEG;
2991 Src = Src.getOperand(1);
2992 }
2993 }
2994
2995 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2996 Mods |= SISrcMods::ABS;
2997 Src = Src.getOperand(0);
2998 }
2999
3000 return true;
3001}
3002
3003bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3004 SDValue &SrcMods) const {
3005 unsigned Mods;
3006 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3007 /*AllowAbs=*/true)) {
3008 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3009 return true;
3010 }
3011
3012 return false;
3013}
3014
3015bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3016 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3017 unsigned Mods;
3018 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3019 /*AllowAbs=*/true)) {
3020 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3021 return true;
3022 }
3023
3024 return false;
3025}
3026
3027bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3028 SDValue &SrcMods) const {
3029 unsigned Mods;
3030 if (SelectVOP3ModsImpl(In, Src, Mods,
3031 /*IsCanonicalizing=*/true,
3032 /*AllowAbs=*/false)) {
3033 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3034 return true;
3035 }
3036
3037 return false;
3038}
3039
3040bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3041 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3042 return false;
3043
3044 Src = In;
3045 return true;
3046}
3047
3048bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3049 SDValue &SrcMods,
3050 bool OpSel) const {
3051 unsigned Mods;
3052 if (SelectVOP3ModsImpl(In, Src, Mods,
3053 /*IsCanonicalizing=*/true,
3054 /*AllowAbs=*/false)) {
3055 if (OpSel)
3056 Mods |= SISrcMods::OP_SEL_0;
3057 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3058 return true;
3059 }
3060
3061 return false;
3062}
3063
3064bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
3065 SDValue &SrcMods) const {
3066 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
3067}
3068
3069bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
3070 SDValue &SrcMods) const {
3071 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
3072}
3073
3074bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3075 SDValue &SrcMods, SDValue &Clamp,
3076 SDValue &Omod) const {
3077 SDLoc DL(In);
3078 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3079 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3080
3081 return SelectVOP3Mods(In, Src, SrcMods);
3082}
3083
3084bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3085 SDValue &SrcMods, SDValue &Clamp,
3086 SDValue &Omod) const {
3087 SDLoc DL(In);
3088 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3089 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3090
3091 return SelectVOP3BMods(In, Src, SrcMods);
3092}
3093
3094bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3095 SDValue &Clamp, SDValue &Omod) const {
3096 Src = In;
3097
3098 SDLoc DL(In);
3099 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3100 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3101
3102 return true;
3103}
3104
3105bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3106 SDValue &SrcMods, bool IsDOT) const {
3107 unsigned Mods = SISrcMods::NONE;
3108 Src = In;
3109
3110 // TODO: Handle G_FSUB 0 as fneg
3111 if (Src.getOpcode() == ISD::FNEG) {
3112 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3113 Src = Src.getOperand(0);
3114 }
3115
3116 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3117 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3118 unsigned VecMods = Mods;
3119
3120 SDValue Lo = stripBitcast(Src.getOperand(0));
3121 SDValue Hi = stripBitcast(Src.getOperand(1));
3122
3123 if (Lo.getOpcode() == ISD::FNEG) {
3124 Lo = stripBitcast(Lo.getOperand(0));
3125 Mods ^= SISrcMods::NEG;
3126 }
3127
3128 if (Hi.getOpcode() == ISD::FNEG) {
3129 Hi = stripBitcast(Hi.getOperand(0));
3130 Mods ^= SISrcMods::NEG_HI;
3131 }
3132
3133 if (isExtractHiElt(Lo, Lo))
3134 Mods |= SISrcMods::OP_SEL_0;
3135
3136 if (isExtractHiElt(Hi, Hi))
3137 Mods |= SISrcMods::OP_SEL_1;
3138
3139 unsigned VecSize = Src.getValueSizeInBits();
3140 Lo = stripExtractLoElt(Lo);
3141 Hi = stripExtractLoElt(Hi);
3142
3143 if (Lo.getValueSizeInBits() > VecSize) {
3144 Lo = CurDAG->getTargetExtractSubreg(
3145 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3146 MVT::getIntegerVT(VecSize), Lo);
3147 }
3148
3149 if (Hi.getValueSizeInBits() > VecSize) {
3150 Hi = CurDAG->getTargetExtractSubreg(
3151 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3152 MVT::getIntegerVT(VecSize), Hi);
3153 }
3154
3155 assert(Lo.getValueSizeInBits() <= VecSize &&
3156 Hi.getValueSizeInBits() <= VecSize);
3157
3158 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3159 // Really a scalar input. Just select from the low half of the register to
3160 // avoid packing.
3161
3162 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3163 Src = Lo;
3164 } else {
3165 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3166
3167 SDLoc SL(In);
3168 SDValue Undef = SDValue(
3169 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3170 Lo.getValueType()), 0);
3171 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3172 : AMDGPU::SReg_64RegClassID;
3173 const SDValue Ops[] = {
3174 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3175 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3176 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3177
3178 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3179 Src.getValueType(), Ops), 0);
3180 }
3181 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3182 return true;
3183 }
3184
3185 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3186 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3187 .bitcastToAPInt().getZExtValue();
3188 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3189 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3190 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3191 return true;
3192 }
3193 }
3194
3195 Mods = VecMods;
3196 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3197 Src.getNumOperands() == 2) {
3198
3199 // TODO: We should repeat the build_vector source check above for the
3200 // vector_shuffle for negates and casts of individual elements.
3201
3202 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3203 ArrayRef<int> Mask = SVN->getMask();
3204
3205 if (Mask[0] < 2 && Mask[1] < 2) {
3206 // src1 should be undef.
3207 SDValue ShuffleSrc = SVN->getOperand(0);
3208
3209 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3210 ShuffleSrc = ShuffleSrc.getOperand(0);
3211 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3212 }
3213
3214 if (Mask[0] == 1)
3215 Mods |= SISrcMods::OP_SEL_0;
3216 if (Mask[1] == 1)
3217 Mods |= SISrcMods::OP_SEL_1;
3218
3219 Src = ShuffleSrc;
3220 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3221 return true;
3222 }
3223 }
3224
3225 // Packed instructions do not have abs modifiers.
3226 Mods |= SISrcMods::OP_SEL_1;
3227
3228 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3229 return true;
3230}
3231
3232bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3233 SDValue &SrcMods) const {
3234 return SelectVOP3PMods(In, Src, SrcMods, true);
3235}
3236
3237bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3238 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3239 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3240 // 1 promotes packed values to signed, 0 treats them as unsigned.
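 // For example, a literal 1 yields OP_SEL_1 | NEG (signed interpretation),
 // while a literal 0 yields just OP_SEL_1 (unsigned).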
3241 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3242
3243 unsigned Mods = SISrcMods::OP_SEL_1;
3244 unsigned SrcSign = C->getZExtValue();
3245 if (SrcSign == 1)
3246 Mods ^= SISrcMods::NEG;
3247
3248 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3249 return true;
3250}
3251
3252bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3253 SDValue &Src) const {
3254 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3255 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3256
3257 unsigned Mods = SISrcMods::OP_SEL_1;
3258 unsigned SrcVal = C->getZExtValue();
3259 if (SrcVal == 1)
3260 Mods |= SISrcMods::OP_SEL_0;
3261
3262 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3263 return true;
3264}
3265
3266static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3267 llvm::SelectionDAG *CurDAG,
3268 const SDLoc &DL) {
3269 unsigned DstRegClass;
3270 EVT DstTy;
3271 switch (Elts.size()) {
3272 case 8:
3273 DstRegClass = AMDGPU::VReg_256RegClassID;
3274 DstTy = MVT::v8i32;
3275 break;
3276 case 4:
3277 DstRegClass = AMDGPU::VReg_128RegClassID;
3278 DstTy = MVT::v4i32;
3279 break;
3280 case 2:
3281 DstRegClass = AMDGPU::VReg_64RegClassID;
3282 DstTy = MVT::v2i32;
3283 break;
3284 default:
3285 llvm_unreachable("unhandled Reg sequence size");
3286 }
3287
3288 SmallVector<SDValue, 8 + 8 + 1> Ops;
3289 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3290 for (unsigned i = 0; i < Elts.size(); ++i) {
3291 Ops.push_back(Elts[i]);
3292 Ops.push_back(CurDAG->getTargetConstant(
3293 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3294 }
3295 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3296}
3297
3298static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3299 llvm::SelectionDAG *CurDAG,
3300 const SDLoc &DL) {
3301 SmallVector<SDValue, 8> PackedElts;
3302 assert("unhandled Reg sequence size" &&
3303 (Elts.size() == 8 || Elts.size() == 16));
3304
3305 // Pack 16-bit elements in pairs into a 32-bit register. If both elements are
3306 // unpacked from the same 32-bit source, use it; otherwise pack them using v_perm.
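 // For example, the 0x05040100 selector used below packs the low 16 bits of
 // each input of a pair into a single 32-bit value.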
3307 for (unsigned i = 0; i < Elts.size(); i += 2) {
3308 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3309 SDValue HiSrc;
3310 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3311 PackedElts.push_back(HiSrc);
3312 } else {
3313 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3314 MachineSDNode *Packed =
3315 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3316 {Elts[i + 1], Elts[i], PackLoLo});
3317 PackedElts.push_back(SDValue(Packed, 0));
3318 }
3319 }
3320
3321 return buildRegSequence32(PackedElts, CurDAG, DL);
3322}
3323
3324static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3325 llvm::SelectionDAG *CurDAG,
3326 const SDLoc &DL, unsigned ElementSize) {
3327 if (ElementSize == 16)
3328 return buildRegSequence16(Elts, CurDAG, DL);
3329 if (ElementSize == 32)
3330 return buildRegSequence32(Elts, CurDAG, DL);
3331 llvm_unreachable("Unhandled element size");
3332}
3333
3334static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3335 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3336 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3337 unsigned ElementSize) {
3338 if (ModOpcode == ISD::FNEG) {
3339 Mods |= SISrcMods::NEG;
3340 // Check if all elements also have abs modifier
3341 SmallVector<SDValue, 8> NegAbsElts;
3342 for (auto El : Elts) {
3343 if (El.getOpcode() != ISD::FABS)
3344 break;
3345 NegAbsElts.push_back(El->getOperand(0));
3346 }
3347 if (Elts.size() != NegAbsElts.size()) {
3348 // Neg
3349 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3350 } else {
3351 // Neg and Abs
3352 Mods |= SISrcMods::NEG_HI;
3353 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3354 }
3355 } else {
3356 assert(ModOpcode == ISD::FABS);
3357 // Abs
3358 Mods |= SISrcMods::NEG_HI;
3359 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3360 }
3361}
3362
3363// Check all f16 elements for modifiers while looking through b32 and v2b16
3364// build vectors; stop if an element does not satisfy ModifierCheck.
3365static void
3366checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3367 std::function<bool(SDValue)> ModifierCheck) {
3368 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3369 if (auto *F16Pair =
3370 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3371 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3372 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3373 if (!ModifierCheck(ElF16))
3374 break;
3375 }
3376 }
3377 }
3378}
3379
3380bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3381 SDValue &SrcMods) const {
3382 Src = In;
3383 unsigned Mods = SISrcMods::OP_SEL_1;
3384
3385 // mods are on f16 elements
3386 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3387 SmallVector<SDValue, 8> EltsF16;
3388
3389 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3390 if (Element.getOpcode() != ISD::FNEG)
3391 return false;
3392 EltsF16.push_back(Element.getOperand(0));
3393 return true;
3394 });
3395
3396 // All elements have neg modifier
3397 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3398 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3399 Mods |= SISrcMods::NEG;
3400 Mods |= SISrcMods::NEG_HI;
3401 }
3402 }
3403
3404 // mods are on v2f16 elements
3405 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3406 SmallVector<SDValue, 8> EltsV2F16;
3407 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3408 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3409 // Based on first element decide which mod we match, neg or abs
3410 if (ElV2f16.getOpcode() != ISD::FNEG)
3411 break;
3412 EltsV2F16.push_back(ElV2f16.getOperand(0));
3413 }
3414
3415 // All pairs of elements have neg modifier
3416 if (BV->getNumOperands() == EltsV2F16.size()) {
3417 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3418 Mods |= SISrcMods::NEG;
3419 Mods |= SISrcMods::NEG_HI;
3420 }
3421 }
3422
3423 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3424 return true;
3425}
3426
3427bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3428 SDValue &SrcMods) const {
3429 Src = In;
3430 unsigned Mods = SISrcMods::OP_SEL_1;
3431 unsigned ModOpcode;
3432
3433 // mods are on f16 elements
3434 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3435 SmallVector<SDValue, 8> EltsF16;
3436 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3437 // Based on first element decide which mod we match, neg or abs
3438 if (EltsF16.empty())
3439 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3440 if (ElF16.getOpcode() != ModOpcode)
3441 return false;
3442 EltsF16.push_back(ElF16.getOperand(0));
3443 return true;
3444 });
3445
3446 // All elements have ModOpcode modifier
3447 if (BV->getNumOperands() * 2 == EltsF16.size())
3448 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3449 16);
3450 }
3451
3452 // mods are on v2f16 elements
3453 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3454 SmallVector<SDValue, 8> EltsV2F16;
3455
3456 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3457 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3458 // Based on first element decide which mod we match, neg or abs
3459 if (EltsV2F16.empty())
3460 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3461 if (ElV2f16->getOpcode() != ModOpcode)
3462 break;
3463 EltsV2F16.push_back(ElV2f16->getOperand(0));
3464 }
3465
3466 // All elements have ModOpcode modifier
3467 if (BV->getNumOperands() == EltsV2F16.size())
3468 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3469 32);
3470 }
3471
3472 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3473 return true;
3474}
3475
3476bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3477 SDValue &SrcMods) const {
3478 Src = In;
3479 unsigned Mods = SISrcMods::OP_SEL_1;
3480 SmallVector<SDValue, 8> EltsF32;
3481
3482 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3483 assert(BV->getNumOperands() > 0);
3484 // Based on first element decide which mod we match, neg or abs
3485 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3486 unsigned ModOpcode =
3487 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3488 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3489 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3490 if (ElF32.getOpcode() != ModOpcode)
3491 break;
3492 EltsF32.push_back(ElF32.getOperand(0));
3493 }
3494
3495 // All elements had ModOpcode modifier
3496 if (BV->getNumOperands() == EltsF32.size())
3497 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3498 32);
3499 }
3500
3501 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3502 return true;
3503}
3504
3505bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3506 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3507 BitVector UndefElements;
3508 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3509 if (isInlineImmediate(Splat.getNode())) {
3510 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3511 unsigned Imm = C->getAPIntValue().getSExtValue();
3512 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3513 return true;
3514 }
3515 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3516 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3517 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3518 return true;
3519 }
3520 llvm_unreachable("unhandled Constant node");
3521 }
3522 }
3523
3524 // 16 bit splat
3525 SDValue SplatSrc32 = stripBitcast(In);
3526 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3527 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3528 SDValue SplatSrc16 = stripBitcast(Splat32);
3529 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3530 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3531 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3532 std::optional<APInt> RawValue;
3533 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3534 RawValue = C->getValueAPF().bitcastToAPInt();
3535 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3536 RawValue = C->getAPIntValue();
3537
3538 if (RawValue.has_value()) {
3539 EVT VT = In.getValueType().getScalarType();
3540 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3541 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3542 ? APFloat::IEEEhalf()
3543 : APFloat::BFloat(),
3544 RawValue.value());
3545 if (TII->isInlineConstant(FloatVal)) {
3546 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3547 MVT::i16);
3548 return true;
3549 }
3550 } else if (VT.getSimpleVT() == MVT::i16) {
3551 if (TII->isInlineConstant(RawValue.value())) {
3552 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3553 MVT::i16);
3554 return true;
3555 }
3556 } else
3557 llvm_unreachable("unknown 16-bit type");
3558 }
3559 }
3560 }
3561
3562 return false;
3563}
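// Editorial note (illustrative sketch, not part of the source): a splat such
// as v8i32 (build_vector 1, 1, ..., 1) becomes a single 32-bit inline
// immediate operand. A 16-bit splat that arrives as a splat of 32-bit pairs
// (e.g. a v2f16 splat of 1.0 repeated across the vector, viewed through
// bitcasts) is emitted as an i16 inline constant when TII->isInlineConstant
// accepts the raw value.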
3564
3565bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3566 SDValue &IndexKey) const {
3567 unsigned Key = 0;
3568 Src = In;
3569
3570 if (In.getOpcode() == ISD::SRL) {
3571 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3572 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3573 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3574 ShiftAmt->getZExtValue() % 8 == 0) {
3575 Key = ShiftAmt->getZExtValue() / 8;
3576 Src = ShiftSrc;
3577 }
3578 }
3579
3580 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3581 return true;
3582}
3583
3584bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3585 SDValue &IndexKey) const {
3586 unsigned Key = 0;
3587 Src = In;
3588
3589 if (In.getOpcode() == ISD::SRL) {
3590 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3591 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3592 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3593 ShiftAmt->getZExtValue() == 16) {
3594 Key = 1;
3595 Src = ShiftSrc;
3596 }
3597 }
3598
3599 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3600 return true;
3601}
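// Editorial note (illustrative sketch, not part of the source): the two
// selectors above fold a byte/half extract into the SWMMAC index_key operand.
// For a 32-bit source x:
//   (srl x, 16) in the 16-bit case -> Src = x, IndexKey = 1
//   (srl x, 24) in the 8-bit case  -> Src = x, IndexKey = 3
// Any other input keeps IndexKey = 0 with Src = In unchanged.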
3602
3603bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3604 SDValue &SrcMods) const {
3605 Src = In;
3606 // FIXME: Handle op_sel
3607 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3608 return true;
3609}
3610
3611bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3612 SDValue &SrcMods) const {
3613 // FIXME: Handle op_sel
3614 return SelectVOP3Mods(In, Src, SrcMods);
3615}
3616
3617// The return value is not whether the match is possible (which it always is),
3618 // but whether or not a conversion is actually used.
3619bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3620 unsigned &Mods) const {
3621 Mods = 0;
3622 SelectVOP3ModsImpl(In, Src, Mods);
3623
3624 if (Src.getOpcode() == ISD::FP_EXTEND) {
3625 Src = Src.getOperand(0);
3626 assert(Src.getValueType() == MVT::f16);
3627 Src = stripBitcast(Src);
3628
3629 // Be careful about folding modifiers if we already have an abs. fneg is
3630 // applied last, so we don't want to apply an earlier fneg.
3631 if ((Mods & SISrcMods::ABS) == 0) {
3632 unsigned ModsTmp;
3633 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3634
3635 if ((ModsTmp & SISrcMods::NEG) != 0)
3636 Mods ^= SISrcMods::NEG;
3637
3638 if ((ModsTmp & SISrcMods::ABS) != 0)
3639 Mods |= SISrcMods::ABS;
3640 }
3641
3642 // op_sel/op_sel_hi decide the source type and source.
3643 // If the source's op_sel_hi is set, it indicates a conversion from fp16 is
3644 // performed. If the source's op_sel is set, it picks the high half of the
3645 // source register.
3646
3647 Mods |= SISrcMods::OP_SEL_1;
3648 if (isExtractHiElt(Src, Src)) {
3649 Mods |= SISrcMods::OP_SEL_0;
3650
3651 // TODO: Should we try to look for neg/abs here?
3652 }
3653
3654 return true;
3655 }
3656
3657 return false;
3658}
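// Editorial note (illustrative sketch, not part of the source): for mad-mix
// sources, a pattern like
//   (fp_extend (fneg f16:x))  ->  Src = x, Mods = NEG | OP_SEL_1
// i.e. OP_SEL_1 marks the operand as f16 data to be converted, and OP_SEL_0
// is added on top when the f16 value lives in the high half of its 32-bit
// register (isExtractHiElt).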
3659
3660bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3661 SDValue &SrcMods) const {
3662 unsigned Mods = 0;
3663 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3664 return false;
3665 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3666 return true;
3667}
3668
3669bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3670 SDValue &SrcMods) const {
3671 unsigned Mods = 0;
3672 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3673 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3674 return true;
3675}
3676
3677// Match BITOP3 operation and return a number of matched instructions plus
3678// truth table.
3679static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3680 SmallVectorImpl<SDValue> &Src) {
3681 unsigned NumOpcodes = 0;
3682 uint8_t LHSBits, RHSBits;
3683
3684 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3685 // Define truth table given Src0, Src1, Src2 bits permutations:
3686 // 0 0 0
3687 // 0 0 1
3688 // 0 1 0
3689 // 0 1 1
3690 // 1 0 0
3691 // 1 0 1
3692 // 1 1 0
3693 // 1 1 1
3694 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3695
3696 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3697 if (C->isAllOnes()) {
3698 Bits = 0xff;
3699 return true;
3700 }
3701 if (C->isZero()) {
3702 Bits = 0;
3703 return true;
3704 }
3705 }
3706
3707 for (unsigned I = 0; I < Src.size(); ++I) {
3708 // Try to find existing reused operand
3709 if (Src[I] == Op) {
3710 Bits = SrcBits[I];
3711 return true;
3712 }
3713 // Try to replace parent operator
3714 if (Src[I] == In) {
3715 Bits = SrcBits[I];
3716 Src[I] = Op;
3717 return true;
3718 }
3719 }
3720
3721 if (Src.size() == 3) {
3722 // No room left for operands. Try one last time; there can be a 'not' of
3723 // one of our source operands. In this case we can compute the bits
3724 // without growing the Src vector.
3725 if (Op.getOpcode() == ISD::XOR) {
3726 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3727 if (C->isAllOnes()) {
3728 SDValue LHS = Op.getOperand(0);
3729 for (unsigned I = 0; I < Src.size(); ++I) {
3730 if (Src[I] == LHS) {
3731 Bits = ~SrcBits[I];
3732 return true;
3733 }
3734 }
3735 }
3736 }
3737 }
3738
3739 return false;
3740 }
3741
3742 Bits = SrcBits[Src.size()];
3743 Src.push_back(Op);
3744 return true;
3745 };
3746
3747 switch (In.getOpcode()) {
3748 case ISD::AND:
3749 case ISD::OR:
3750 case ISD::XOR: {
3751 SDValue LHS = In.getOperand(0);
3752 SDValue RHS = In.getOperand(1);
3753
3754 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3755 if (!getOperandBits(LHS, LHSBits) ||
3756 !getOperandBits(RHS, RHSBits)) {
3757 Src = Backup;
3758 return std::make_pair(0, 0);
3759 }
3760
3761 // Recursion is naturally limited by the size of the operand vector.
3762 auto Op = BitOp3_Op(LHS, Src);
3763 if (Op.first) {
3764 NumOpcodes += Op.first;
3765 LHSBits = Op.second;
3766 }
3767
3768 Op = BitOp3_Op(RHS, Src);
3769 if (Op.first) {
3770 NumOpcodes += Op.first;
3771 RHSBits = Op.second;
3772 }
3773 break;
3774 }
3775 default:
3776 return std::make_pair(0, 0);
3777 }
3778
3779 uint8_t TTbl;
3780 switch (In.getOpcode()) {
3781 case ISD::AND:
3782 TTbl = LHSBits & RHSBits;
3783 break;
3784 case ISD::OR:
3785 TTbl = LHSBits | RHSBits;
3786 break;
3787 case ISD::XOR:
3788 TTbl = LHSBits ^ RHSBits;
3789 break;
3790 default:
3791 break;
3792 }
3793
3794 return std::make_pair(NumOpcodes + 1, TTbl);
3795}
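// Editorial note (worked example, not part of the source): for
//   In = (xor (and a, b), c)
// the matcher ends up with Src = [a, c, b] (the inner AND first occupies
// slot 0 and is then replaced by its own operand), giving the masks
// a = 0xf0, c = 0xcc, b = 0xaa and therefore
//   TTbl = (0xf0 & 0xaa) ^ 0xcc = 0x6c,  NumOpcodes = 2,
// i.e. two logical operations folded into one BITOP3 truth table.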
3796
3797bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
3798 SDValue &Src2, SDValue &Tbl) const {
3799 SmallVector<SDValue, 3> Src;
3800 uint8_t TTbl;
3801 unsigned NumOpcodes;
3802
3803 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
3804
3805 // The Src.empty() case can happen if all operands are all zero or all ones.
3806 // Normally this is optimized out before reaching this point.
3807 if (NumOpcodes < 2 || Src.empty())
3808 return false;
3809
3810 // For a uniform case the threshold should be higher to account for moves
3811 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3812 // be in SGPRs, with a readfirstlane afterwards.
3813 if (NumOpcodes < 4 && !In->isDivergent())
3814 return false;
3815
3816 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
3817 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3818 // asm more readable. This cannot be modeled with AddedComplexity because
3819 // the selector does not know how many operations we matched.
3820 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
3821 (In.getOperand(0).getOpcode() == In.getOpcode() ||
3822 In.getOperand(1).getOpcode() == In.getOpcode()))
3823 return false;
3824
3825 if (In.getOpcode() == ISD::OR &&
3826 (In.getOperand(0).getOpcode() == ISD::AND ||
3827 In.getOperand(1).getOpcode() == ISD::AND))
3828 return false;
3829 }
3830
3831 // Last operand can be ignored, turning a ternary operation into a binary.
3832 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3833 // 'c' with 'a' here without changing the answer. In some pathological
3834 // cases it should even be possible to end up with a single-operand
3835 // operation, if the optimizer did not catch it.
3836 while (Src.size() < 3)
3837 Src.push_back(Src[0]);
3838
3839 Src0 = Src[0];
3840 Src1 = Src[1];
3841 Src2 = Src[2];
3842
3843 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
3844 return true;
3845}
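// Editorial note (illustrative sketch, not part of the source): SelectBITOP3
// only folds when it pays off. Uniform (non-divergent) values need at least
// four matched operations (NumOpcodes >= 4) to amortize the SGPR<->VGPR
// moves, and two-operation i32 shapes that map onto or3/xor3/and_or style
// instructions are deliberately left to the ordinary patterns. When fewer
// than three distinct operands were found, the missing sources are simply
// duplicates of Src[0].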
3846
3847SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3848 if (In.isUndef())
3849 return CurDAG->getUNDEF(MVT::i32);
3850
3851 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3852 SDLoc SL(In);
3853 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3854 }
3855
3856 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3857 SDLoc SL(In);
3858 return CurDAG->getConstant(
3859 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3860 }
3861
3862 SDValue Src;
3863 if (isExtractHiElt(In, Src))
3864 return Src;
3865
3866 return SDValue();
3867}
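// Editorial note (illustrative sketch, not part of the source): getHi16Elt
// produces a value suitable for the high half of a packed 32-bit operand,
// e.g. the constant 0x1234 becomes 0x12340000, and when In is already an
// extract of the high 16 bits of some 32-bit value x, the full 32-bit x is
// returned via isExtractHiElt.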
3868
3869bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3870 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3871
3872 const SIRegisterInfo *SIRI =
3873 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3874 const SIInstrInfo * SII =
3875 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3876
3877 unsigned Limit = 0;
3878 bool AllUsesAcceptSReg = true;
3879 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3880 Limit < 10 && U != E; ++U, ++Limit) {
3881 const TargetRegisterClass *RC =
3882 getOperandRegClass(U->getUser(), U->getOperandNo());
3883
3884 // If the register class is unknown, it could be a register class
3885 // that needs to be an SGPR, e.g. an inline asm
3886 // constraint.
3887 if (!RC || SIRI->isSGPRClass(RC))
3888 return false;
3889
3890 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3891 AllUsesAcceptSReg = false;
3892 SDNode *User = U->getUser();
3893 if (User->isMachineOpcode()) {
3894 unsigned Opc = User->getMachineOpcode();
3895 const MCInstrDesc &Desc = SII->get(Opc);
3896 if (Desc.isCommutable()) {
3897 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
3898 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3899 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3900 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3901 const TargetRegisterClass *CommutedRC =
3902 getOperandRegClass(U->getUser(), CommutedOpNo);
3903 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3904 CommutedRC == &AMDGPU::VS_64RegClass)
3905 AllUsesAcceptSReg = true;
3906 }
3907 }
3908 }
3909 // If AllUsesAcceptSReg is still false, we have not succeeded in
3910 // commuting the current user. This means there is at least one use
3911 // that strictly requires a VGPR, so we will not attempt to commute
3912 // other user instructions.
3913 if (!AllUsesAcceptSReg)
3914 break;
3915 }
3916 }
3917 return !AllUsesAcceptSReg && (Limit < 10);
3918}
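// Editorial note (summary, not part of the source): isVGPRImm answers whether
// an immediate is better materialized in a VGPR. It returns true when no use
// demands an SGPR-only (or unknown) operand class and at least one use cannot
// take the value in an SGPR even after commuting, checking at most 10 uses.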
3919
3920bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3921 const auto *Ld = cast<LoadSDNode>(N);
3922
3923 const MachineMemOperand *MMO = Ld->getMemOperand();
3924 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3925 return false;
3926
3927 return MMO->getSize().hasValue() &&
3928 Ld->getAlign() >=
3929 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3930 uint64_t(4))) &&
3931 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3932 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3933 (Subtarget->getScalarizeGlobalBehavior() &&
3934 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3935 Ld->isSimple() &&
3936 static_cast<const SITargetLowering *>(getTargetLowering())
3937 ->isMemOpHasNoClobberedMemOperand(N)));
3938}
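// Editorial note (summary, not part of the source): a load qualifies as
// uniform here when its MMO is uniform (or the node itself is not divergent),
// its size is known, it is aligned to at least min(size, 4) bytes, and it is
// either a constant-address-space load or, when scalarizing of global loads
// is enabled, a simple global load whose memory is provably not clobbered.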
3939
3940void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3941 const AMDGPUTargetLowering& Lowering =
3942 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3943 bool IsModified = false;
3944 do {
3945 IsModified = false;
3946
3947 // Go over all selected nodes and try to fold them a bit more
3948 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3949 while (Position != CurDAG->allnodes_end()) {
3950 SDNode *Node = &*Position++;
3951 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3952 if (!MachineNode)
3953 continue;
3954
3955 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3956 if (ResNode != Node) {
3957 if (ResNode)
3958 ReplaceUses(Node, ResNode);
3959 IsModified = true;
3960 }
3961 }
3962 CurDAG->RemoveDeadNodes();
3963 } while (IsModified);
3964}
3965
3966AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3967 CodeGenOptLevel OptLevel)
3968 : SelectionDAGISelLegacy(
3969 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3970