AMDGPUISelDAGToDAG.cpp
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
33
34#ifdef EXPENSIVE_CHECKS
36#include "llvm/IR/Dominators.h"
37#endif
38
39#define DEBUG_TYPE "amdgpu-isel"
40
41using namespace llvm;
42
43//===----------------------------------------------------------------------===//
44// Instruction Selector Implementation
45//===----------------------------------------------------------------------===//
46
47namespace {
48static SDValue stripBitcast(SDValue Val) {
49 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
50}
51
52// Figure out if this is really an extract of the high 16-bits of a dword.
53static bool isExtractHiElt(SDValue In, SDValue &Out) {
54 In = stripBitcast(In);
55
56 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
57 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
58 if (!Idx->isOne())
59 return false;
60 Out = In.getOperand(0);
61 return true;
62 }
63 }
64
65 if (In.getOpcode() != ISD::TRUNCATE)
66 return false;
67
68 SDValue Srl = In.getOperand(0);
69 if (Srl.getOpcode() == ISD::SRL) {
70 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
71 if (ShiftAmt->getZExtValue() == 16) {
72 Out = stripBitcast(Srl.getOperand(0));
73 return true;
74 }
75 }
76 }
77
78 return false;
79}
80
81// Look through operations that obscure just looking at the low 16-bits of the
82// same register.
83static SDValue stripExtractLoElt(SDValue In) {
84 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
85 SDValue Idx = In.getOperand(1);
86 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
87 return In.getOperand(0);
88 }
89
90 if (In.getOpcode() == ISD::TRUNCATE) {
91 SDValue Src = In.getOperand(0);
92 if (Src.getValueType().getSizeInBits() == 32)
93 return stripBitcast(Src);
94 }
95
96 return In;
97}
98
99} // end anonymous namespace
100
102 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
106#ifdef EXPENSIVE_CHECKS
109#endif
111 "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
112
113/// This pass converts a legalized DAG into an AMDGPU-specific
114/// DAG, ready for instruction scheduling.
115FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
116 CodeGenOptLevel OptLevel) {
117 return new AMDGPUDAGToDAGISel(TM, OptLevel);
118}
119
120AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
121 CodeGenOptLevel OptLevel)
122 : SelectionDAGISel(ID, TM, OptLevel) {
123 EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
124}
125
126bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
127#ifdef EXPENSIVE_CHECKS
128 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
129 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
130 for (auto &L : LI->getLoopsInPreorder()) {
131 assert(L->isLCSSAForm(DT));
132 }
133#endif
134 Subtarget = &MF.getSubtarget<GCNSubtarget>();
135 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
136 return SelectionDAGISel::runOnMachineFunction(MF);
137}
138
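// Returns true if the 16-bit source operation \p Opc is known to write zeros
// into the high 16 bits of its 32-bit destination register on this subtarget.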
139bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
140 // XXX - only need to list legal operations.
141 switch (Opc) {
142 case ISD::FADD:
143 case ISD::FSUB:
144 case ISD::FMUL:
145 case ISD::FDIV:
146 case ISD::FREM:
147 case ISD::FCANONICALIZE:
148 case ISD::UINT_TO_FP:
149 case ISD::SINT_TO_FP:
150 case ISD::FABS:
151 // Fabs is lowered to a bit operation, but it's an and which will clear the
152 // high bits anyway.
153 case ISD::FSQRT:
154 case ISD::FSIN:
155 case ISD::FCOS:
156 case ISD::FPOWI:
157 case ISD::FPOW:
158 case ISD::FLOG:
159 case ISD::FLOG2:
160 case ISD::FLOG10:
161 case ISD::FEXP:
162 case ISD::FEXP2:
163 case ISD::FCEIL:
164 case ISD::FTRUNC:
165 case ISD::FRINT:
166 case ISD::FNEARBYINT:
167 case ISD::FROUNDEVEN:
168 case ISD::FROUND:
169 case ISD::FFLOOR:
170 case ISD::FMINNUM:
171 case ISD::FMAXNUM:
172 case ISD::FLDEXP:
173 case AMDGPUISD::FRACT:
174 case AMDGPUISD::CLAMP:
175 case AMDGPUISD::COS_HW:
176 case AMDGPUISD::SIN_HW:
177 case AMDGPUISD::FMIN3:
178 case AMDGPUISD::FMAX3:
179 case AMDGPUISD::FMED3:
180 case AMDGPUISD::FMAD_FTZ:
181 case AMDGPUISD::RCP:
182 case AMDGPUISD::RSQ:
183 case AMDGPUISD::RCP_IFLAG:
184 // On gfx10, all 16-bit instructions preserve the high bits.
185 return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
186 case ISD::FP_ROUND:
187 // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
188 // high bits on gfx9.
189 // TODO: If we had the source node we could see if the source was fma/mad
190 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
191 case ISD::FMA:
192 case ISD::FMAD:
193 case AMDGPUISD::DIV_FIXUP:
194 return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
195 default:
196 // fcopysign, select and others may be lowered to 32-bit bit operations
197 // which don't zero the high bits.
198 return false;
199 }
200}
201
202void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
203 AU.addRequired<AMDGPUArgumentUsageInfo>();
204 AU.addRequired<UniformityInfoWrapperPass>();
205#ifdef EXPENSIVE_CHECKS
206 AU.addRequired<DominatorTreeWrapperPass>();
207 AU.addRequired<LoopInfoWrapperPass>();
208#endif
209 SelectionDAGISel::getAnalysisUsage(AU);
210}
211
212bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
213 assert(Subtarget->d16PreservesUnusedBits());
214 MVT VT = N->getValueType(0).getSimpleVT();
215 if (VT != MVT::v2i16 && VT != MVT::v2f16)
216 return false;
217
218 SDValue Lo = N->getOperand(0);
219 SDValue Hi = N->getOperand(1);
220
221 LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
222
223 // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
224 // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
225 // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
226
227 // Need to check for possible indirect dependencies on the other half of the
228 // vector to avoid introducing a cycle.
229 if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
230 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
231
233 SDValue Ops[] = {
234 LdHi->getChain(), LdHi->getBasePtr(), TiedIn
235 };
236
237 unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
238 if (LdHi->getMemoryVT() == MVT::i8) {
239 LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
240 AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
241 } else {
242 assert(LdHi->getMemoryVT() == MVT::i16);
243 }
244
245 SDValue NewLoadHi =
246 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
247 Ops, LdHi->getMemoryVT(),
248 LdHi->getMemOperand());
249
250 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
251 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
252 return true;
253 }
254
255 // build_vector (load ptr), hi -> load_d16_lo ptr, hi
256 // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
257 // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
258 LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
259 if (LdLo && Lo.hasOneUse()) {
260 SDValue TiedIn = getHi16Elt(Hi);
261 if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
262 return false;
263
264 SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
265 unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
266 if (LdLo->getMemoryVT() == MVT::i8) {
267 LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
268 AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
269 } else {
270 assert(LdLo->getMemoryVT() == MVT::i16);
271 }
272
273 TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
274
275 SDValue Ops[] = {
276 LdLo->getChain(), LdLo->getBasePtr(), TiedIn
277 };
278
279 SDValue NewLoadLo =
280 CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
281 Ops, LdLo->getMemoryVT(),
282 LdLo->getMemOperand());
283
284 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
285 CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
286 return true;
287 }
288
289 return false;
290}
291
292void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
293 if (!Subtarget->d16PreservesUnusedBits())
294 return;
295
296 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
297
298 bool MadeChange = false;
299 while (Position != CurDAG->allnodes_begin()) {
300 SDNode *N = &*--Position;
301 if (N->use_empty())
302 continue;
303
304 switch (N->getOpcode()) {
305 case ISD::BUILD_VECTOR:
306 // TODO: Match load d16 from shl (extload:i16), 16
307 MadeChange |= matchLoadD16FromBuildVector(N);
308 break;
309 default:
310 break;
311 }
312 }
313
314 if (MadeChange) {
315 CurDAG->RemoveDeadNodes();
316 LLVM_DEBUG(dbgs() << "After PreProcess:\n";
317 CurDAG->dump(););
318 }
319}
320
321bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
322 if (N->isUndef())
323 return true;
324
325 const SIInstrInfo *TII = Subtarget->getInstrInfo();
326 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
327 return TII->isInlineConstant(C->getAPIntValue());
328
329 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
330 return TII->isInlineConstant(C->getValueAPF());
331
332 return false;
333}
334
335/// Determine the register class for \p OpNo
336/// \returns The register class of the virtual register that will be used for
337/// the given operand number \p OpNo or NULL if the register class cannot be
338/// determined.
339const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
340 unsigned OpNo) const {
341 if (!N->isMachineOpcode()) {
342 if (N->getOpcode() == ISD::CopyToReg) {
343 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
344 if (Reg.isVirtual()) {
345 MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
346 return MRI.getRegClass(Reg);
347 }
348
349 const SIRegisterInfo *TRI
350 = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
351 return TRI->getPhysRegBaseClass(Reg);
352 }
353
354 return nullptr;
355 }
356
357 switch (N->getMachineOpcode()) {
358 default: {
359 const MCInstrDesc &Desc =
360 Subtarget->getInstrInfo()->get(N->getMachineOpcode());
361 unsigned OpIdx = Desc.getNumDefs() + OpNo;
362 if (OpIdx >= Desc.getNumOperands())
363 return nullptr;
364 int RegClass = Desc.operands()[OpIdx].RegClass;
365 if (RegClass == -1)
366 return nullptr;
367
368 return Subtarget->getRegisterInfo()->getRegClass(RegClass);
369 }
370 case AMDGPU::REG_SEQUENCE: {
371 unsigned RCID = N->getConstantOperandVal(0);
372 const TargetRegisterClass *SuperRC =
373 Subtarget->getRegisterInfo()->getRegClass(RCID);
374
375 SDValue SubRegOp = N->getOperand(OpNo + 1);
376 unsigned SubRegIdx = SubRegOp->getAsZExtVal();
377 return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
378 SubRegIdx);
379 }
380 }
381}
382
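// Rewrite \p N in place so that \p NewChain replaces its chain operand and
// \p Glue is appended as an extra glued operand.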
383SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
384 SDValue Glue) const {
385 SmallVector <SDValue, 8> Ops;
386 Ops.push_back(NewChain); // Replace the chain.
387 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
388 Ops.push_back(N->getOperand(i));
389
390 Ops.push_back(Glue);
391 return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
392}
393
394SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
395 const SITargetLowering& Lowering =
396 *static_cast<const SITargetLowering*>(getTargetLowering());
397
398 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
399
400 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
401 return glueCopyToOp(N, M0, M0.getValue(1));
402}
403
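// For memory nodes that need M0 initialized, glue in a copy of the required
// value: all ones for LDS (when the subtarget requires M0 init), or the GDS
// size for region (GDS) accesses.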
404SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
405 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
406 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
407 if (Subtarget->ldsRequiresM0Init())
408 return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
409 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
410 MachineFunction &MF = CurDAG->getMachineFunction();
411 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
412 return
413 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
414 }
415 return N;
416}
417
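// Materialize a 64-bit immediate into SGPRs as two S_MOV_B32 of the low and
// high halves combined with a REG_SEQUENCE.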
418MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
419 EVT VT) const {
420 SDNode *Lo = CurDAG->getMachineNode(
421 AMDGPU::S_MOV_B32, DL, MVT::i32,
422 CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
423 SDNode *Hi =
424 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
425 CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
426 const SDValue Ops[] = {
427 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
428 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
429 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
430
431 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
432}
433
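// Lower a build_vector (or scalar_to_vector) into a REG_SEQUENCE of the given
// register class, one value/subregister-index pair per element, padding any
// missing elements with IMPLICIT_DEF.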
434void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
435 EVT VT = N->getValueType(0);
436 unsigned NumVectorElts = VT.getVectorNumElements();
437 EVT EltVT = VT.getVectorElementType();
438 SDLoc DL(N);
439 SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
440
441 if (NumVectorElts == 1) {
442 CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
443 RegClass);
444 return;
445 }
446
447 assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
448 "supported yet");
449 // 32 = Max Num Vector Elements
450 // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
451 // 1 = Vector Register Class
452 SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
453
454 bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
455 Triple::amdgcn;
456 RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
457 bool IsRegSeq = true;
458 unsigned NOps = N->getNumOperands();
459 for (unsigned i = 0; i < NOps; i++) {
460 // XXX: Why is this here?
461 if (isa<RegisterSDNode>(N->getOperand(i))) {
462 IsRegSeq = false;
463 break;
464 }
465 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
466 : R600RegisterInfo::getSubRegFromChannel(i);
467 RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
468 RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
469 }
470 if (NOps != NumVectorElts) {
471 // Fill in the missing undef elements if this was a scalar_to_vector.
472 assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
473 MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
474 DL, EltVT);
475 for (unsigned i = NOps; i < NumVectorElts; ++i) {
476 unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
477 : R600RegisterInfo::getSubRegFromChannel(i);
478 RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
479 RegSeqArgs[1 + (2 * i) + 1] =
480 CurDAG->getTargetConstant(Sub, DL, MVT::i32);
481 }
482 }
483
484 if (!IsRegSeq)
485 SelectCode(N);
486 CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
487}
488
489void AMDGPUDAGToDAGISel::Select(SDNode *N) {
490 unsigned int Opc = N->getOpcode();
491 if (N->isMachineOpcode()) {
492 N->setNodeId(-1);
493 return; // Already selected.
494 }
495
496 // isa<MemSDNode> almost works but is slightly too permissive for some DS
497 // intrinsics.
498 if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
501 N = glueCopyToM0LDSInit(N);
502 SelectCode(N);
503 return;
504 }
505
506 switch (Opc) {
507 default:
508 break;
509 // We are selecting i64 ADD here instead of custom lower it during
510 // DAG legalization, so we can fold some i64 ADDs used for address
511 // calculation into the LOAD and STORE instructions.
512 case ISD::ADDC:
513 case ISD::ADDE:
514 case ISD::SUBC:
515 case ISD::SUBE: {
516 if (N->getValueType(0) != MVT::i64)
517 break;
518
519 SelectADD_SUB_I64(N);
520 return;
521 }
522 case ISD::UADDO_CARRY:
523 case ISD::USUBO_CARRY:
524 if (N->getValueType(0) != MVT::i32)
525 break;
526
527 SelectAddcSubb(N);
528 return;
529 case ISD::UADDO:
530 case ISD::USUBO: {
531 SelectUADDO_USUBO(N);
532 return;
533 }
534 case AMDGPUISD::FMUL_W_CHAIN: {
535 SelectFMUL_W_CHAIN(N);
536 return;
537 }
538 case AMDGPUISD::FMA_W_CHAIN: {
539 SelectFMA_W_CHAIN(N);
540 return;
541 }
542
543 case ISD::SCALAR_TO_VECTOR:
544 case ISD::BUILD_VECTOR: {
545 EVT VT = N->getValueType(0);
546 unsigned NumVectorElts = VT.getVectorNumElements();
547 if (VT.getScalarSizeInBits() == 16) {
548 if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
549 if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
550 ReplaceNode(N, Packed);
551 return;
552 }
553 }
554
555 break;
556 }
557
558 assert(VT.getVectorElementType().bitsEq(MVT::i32));
559 unsigned RegClassID =
560 SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
561 SelectBuildVector(N, RegClassID);
562 return;
563 }
564 case ISD::BUILD_PAIR: {
565 SDValue RC, SubReg0, SubReg1;
566 SDLoc DL(N);
567 if (N->getValueType(0) == MVT::i128) {
568 RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
569 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
570 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
571 } else if (N->getValueType(0) == MVT::i64) {
572 RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
573 SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
574 SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
575 } else {
576 llvm_unreachable("Unhandled value type for BUILD_PAIR");
577 }
578 const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
579 N->getOperand(1), SubReg1 };
580 ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
581 N->getValueType(0), Ops));
582 return;
583 }
584
585 case ISD::Constant:
586 case ISD::ConstantFP: {
587 if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
588 break;
589
590 uint64_t Imm;
591 if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
592 Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
593 if (AMDGPU::isValid32BitLiteral(Imm, true))
594 break;
595 } else {
596 ConstantSDNode *C = cast<ConstantSDNode>(N);
597 Imm = C->getZExtValue();
598 if (AMDGPU::isValid32BitLiteral(Imm, false))
599 break;
600 }
601
602 SDLoc DL(N);
603 ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
604 return;
605 }
606 case AMDGPUISD::BFE_I32:
607 case AMDGPUISD::BFE_U32: {
608 // There is a scalar version available, but unlike the vector version which
609 // has a separate operand for the offset and width, the scalar version packs
610 // the width and offset into a single operand. Try to move to the scalar
611 // version if the offsets are constant, so that we can try to keep extended
612 // loads of kernel arguments in SGPRs.
613
614 // TODO: Technically we could try to pattern match scalar bitshifts of
615 // dynamic values, but it's probably not useful.
616 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
617 if (!Offset)
618 break;
619
620 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
621 if (!Width)
622 break;
623
624 bool Signed = Opc == AMDGPUISD::BFE_I32;
625
626 uint32_t OffsetVal = Offset->getZExtValue();
627 uint32_t WidthVal = Width->getZExtValue();
628
629 ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
630 WidthVal));
631 return;
632 }
633 case AMDGPUISD::DIV_SCALE: {
634 SelectDIV_SCALE(N);
635 return;
636 }
637 case AMDGPUISD::MAD_I64_I32:
638 case AMDGPUISD::MAD_U64_U32: {
639 SelectMAD_64_32(N);
640 return;
641 }
642 case ISD::SMUL_LOHI:
643 case ISD::UMUL_LOHI:
644 return SelectMUL_LOHI(N);
645 case ISD::CopyToReg: {
646 const SITargetLowering& Lowering =
647 *static_cast<const SITargetLowering*>(getTargetLowering());
648 N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
649 break;
650 }
651 case ISD::AND:
652 case ISD::SRL:
653 case ISD::SRA:
654 case ISD::SIGN_EXTEND_INREG:
655 if (N->getValueType(0) != MVT::i32)
656 break;
657
658 SelectS_BFE(N);
659 return;
660 case ISD::BRCOND:
661 SelectBRCOND(N);
662 return;
663 case ISD::FP_EXTEND:
664 SelectFP_EXTEND(N);
665 return;
666 case AMDGPUISD::CVT_PKRTZ_F16_F32:
667 case AMDGPUISD::CVT_PKNORM_I16_F32:
668 case AMDGPUISD::CVT_PKNORM_U16_F32:
669 case AMDGPUISD::CVT_PK_U16_U32:
670 case AMDGPUISD::CVT_PK_I16_I32: {
671 // Hack around using a legal type if f16 is illegal.
672 if (N->getValueType(0) == MVT::i32) {
673 MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
674 N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
675 { N->getOperand(0), N->getOperand(1) });
676 SelectCode(N);
677 return;
678 }
679
680 break;
681 }
682 case ISD::INTRINSIC_W_CHAIN: {
683 SelectINTRINSIC_W_CHAIN(N);
684 return;
685 }
686 case ISD::INTRINSIC_WO_CHAIN: {
687 SelectINTRINSIC_WO_CHAIN(N);
688 return;
689 }
690 case ISD::INTRINSIC_VOID: {
691 SelectINTRINSIC_VOID(N);
692 return;
693 }
694 case AMDGPUISD::WAVE_ADDRESS: {
695 SelectWAVE_ADDRESS(N);
696 return;
697 }
698 case ISD::STACKRESTORE: {
699 SelectSTACKRESTORE(N);
700 return;
701 }
702 }
703
704 SelectCode(N);
705}
706
707bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
708 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
709 const Instruction *Term = BB->getTerminator();
710 return Term->getMetadata("amdgpu.uniform") ||
711 Term->getMetadata("structurizecfg.uniform");
712}
713
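// Returns true if an AND masking a shift amount is unneeded: together with the
// known-zero bits of its first operand, the mask leaves the low \p ShAmtBits
// bits unchanged, which is all a shift instruction reads.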
714bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
715 unsigned ShAmtBits) const {
716 assert(N->getOpcode() == ISD::AND);
717
718 const APInt &RHS = N->getConstantOperandAPInt(1);
719 if (RHS.countr_one() >= ShAmtBits)
720 return true;
721
722 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
723 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
724}
725
726static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
727 SDValue &N0, SDValue &N1) {
728 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
729 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
730 // As we split 64-bit `or` earlier, it's a complicated pattern to match, i.e.
731 // (i64 (bitcast (v2i32 (build_vector
732 // (or (extract_vector_elt V, 0), OFFSET),
733 // (extract_vector_elt V, 1)))))
734 SDValue Lo = Addr.getOperand(0).getOperand(0);
735 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
736 SDValue BaseLo = Lo.getOperand(0);
737 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
738 // Check that split base (Lo and Hi) are extracted from the same one.
739 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
740 BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
741 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
742 // Lo is statically extracted from index 0.
743 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
744 BaseLo.getConstantOperandVal(1) == 0 &&
746 // Hi is statically extracted from index 1.
746 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
747 BaseHi.getConstantOperandVal(1) == 1) {
748 N0 = BaseLo.getOperand(0).getOperand(0);
749 N1 = Lo.getOperand(1);
750 return true;
751 }
752 }
753 }
754 return false;
755}
756
757bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
758 SDValue &RHS) const {
759 if (CurDAG->isBaseWithConstantOffset(Addr)) {
760 LHS = Addr.getOperand(0);
761 RHS = Addr.getOperand(1);
762 return true;
763 }
764
765 if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
766 assert(LHS && RHS && isa<ConstantSDNode>(RHS));
767 return true;
768 }
769
770 return false;
771}
772
773StringRef AMDGPUDAGToDAGISel::getPassName() const {
774 return "AMDGPU DAG->DAG Pattern Instruction Selection";
775}
776
777//===----------------------------------------------------------------------===//
778// Complex Patterns
779//===----------------------------------------------------------------------===//
780
781bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
782 SDValue &Offset) {
783 return false;
784}
785
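// Match an R600 indirect address: a plain constant, a DWORDADDR-wrapped
// constant, or (base + constant); otherwise fall back to a zero offset.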
786bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
787 SDValue &Offset) {
788 ConstantSDNode *C;
789 SDLoc DL(Addr);
790
791 if ((C = dyn_cast<ConstantSDNode>(Addr))) {
792 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
793 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
794 } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
795 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
796 Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
797 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
798 } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
799 (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
800 Base = Addr.getOperand(0);
801 Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
802 } else {
803 Base = Addr;
804 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
805 }
806
807 return true;
808}
809
810SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
811 const SDLoc &DL) const {
812 SDNode *Mov = CurDAG->getMachineNode(
813 AMDGPU::S_MOV_B32, DL, MVT::i32,
814 CurDAG->getTargetConstant(Val, DL, MVT::i32));
815 return SDValue(Mov, 0);
816}
817
818// FIXME: Should only handle uaddo_carry/usubo_carry
819void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
820 SDLoc DL(N);
821 SDValue LHS = N->getOperand(0);
822 SDValue RHS = N->getOperand(1);
823
824 unsigned Opcode = N->getOpcode();
825 bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
826 bool ProduceCarry =
827 ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
828 bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
829
830 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
831 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
832
833 SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
834 DL, MVT::i32, LHS, Sub0);
835 SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
836 DL, MVT::i32, LHS, Sub1);
837
838 SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
839 DL, MVT::i32, RHS, Sub0);
840 SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
841 DL, MVT::i32, RHS, Sub1);
842
843 SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
844
845 static const unsigned OpcMap[2][2][2] = {
846 {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
847 {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
848 {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
849 {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
850
851 unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
852 unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
853
854 SDNode *AddLo;
855 if (!ConsumeCarry) {
856 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
857 AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
858 } else {
859 SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
860 AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
861 }
862 SDValue AddHiArgs[] = {
863 SDValue(Hi0, 0),
864 SDValue(Hi1, 0),
865 SDValue(AddLo, 1)
866 };
867 SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
868
869 SDValue RegSequenceArgs[] = {
870 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
871 SDValue(AddLo,0),
872 Sub0,
873 SDValue(AddHi,0),
874 Sub1,
875 };
876 SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
877 MVT::i64, RegSequenceArgs);
878
879 if (ProduceCarry) {
880 // Replace the carry-use
881 ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
882 }
883
884 // Replace the remaining uses.
885 ReplaceNode(N, RegSequence);
886}
887
888void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
889 SDLoc DL(N);
890 SDValue LHS = N->getOperand(0);
891 SDValue RHS = N->getOperand(1);
892 SDValue CI = N->getOperand(2);
893
894 if (N->isDivergent()) {
895 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
896 : AMDGPU::V_SUBB_U32_e64;
897 CurDAG->SelectNodeTo(
898 N, Opc, N->getVTList(),
899 {LHS, RHS, CI,
900 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
901 } else {
902 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
903 : AMDGPU::S_SUB_CO_PSEUDO;
904 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
905 }
906}
907
908void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
909 // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an unsigned
910 // carry out despite the _i32 name. These were renamed in VI to _U32.
911 // FIXME: We should probably rename the opcodes here.
912 bool IsAdd = N->getOpcode() == ISD::UADDO;
913 bool IsVALU = N->isDivergent();
914
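 // Scan users of the carry-out: if it feeds anything other than a
 // carry-consuming add/sub, select the VALU form.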
915 for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
916 ++UI)
917 if (UI.getUse().getResNo() == 1) {
918 if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
919 (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
920 IsVALU = true;
921 break;
922 }
923 }
924
925 if (IsVALU) {
926 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
927
928 CurDAG->SelectNodeTo(
929 N, Opc, N->getVTList(),
930 {N->getOperand(0), N->getOperand(1),
931 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
932 } else {
933 unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
934 : AMDGPU::S_USUBO_PSEUDO;
935
936 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
937 {N->getOperand(0), N->getOperand(1)});
938 }
939}
940
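// Select AMDGPUISD::FMA_W_CHAIN into V_FMA_F32 (or V_FMAC_F32 when no source
// modifiers are set), filling in source modifier, clamp and omod operands.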
941void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
942 SDLoc SL(N);
943 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
944 SDValue Ops[10];
945
946 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
947 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
948 SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
949 Ops[8] = N->getOperand(0);
950 Ops[9] = N->getOperand(4);
951
952 // If there are no source modifiers, prefer fmac over fma because it can use
953 // the smaller VOP2 encoding.
954 bool UseFMAC = Subtarget->hasDLInsts() &&
955 cast<ConstantSDNode>(Ops[0])->isZero() &&
956 cast<ConstantSDNode>(Ops[2])->isZero() &&
957 cast<ConstantSDNode>(Ops[4])->isZero();
958 unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
959 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
960}
961
962void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
963 SDLoc SL(N);
964 // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
965 SDValue Ops[8];
966
967 SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
968 SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
969 Ops[6] = N->getOperand(0);
970 Ops[7] = N->getOperand(3);
971
972 CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
973}
974
975// We need to handle this here because tablegen doesn't support matching
976// instructions with multiple outputs.
977void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
978 SDLoc SL(N);
979 EVT VT = N->getValueType(0);
980
981 assert(VT == MVT::f32 || VT == MVT::f64);
982
983 unsigned Opc
984 = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
985
986 // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
987 // omod
988 SDValue Ops[8];
989 SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
990 SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
991 SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
992 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
993}
994
995// We need to handle this here because tablegen doesn't support matching
996// instructions with multiple outputs.
997void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
998 SDLoc SL(N);
999 bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1000 unsigned Opc;
1001 if (Subtarget->hasMADIntraFwdBug())
1002 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1003 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1004 else
1005 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1006
1007 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1008 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1009 Clamp };
1010 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1011}
1012
1013// We need to handle this here because tablegen doesn't support matching
1014// instructions with multiple outputs.
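// Select [SU]MUL_LOHI as a single V_MAD_[IU]64_[IU]32 with a zero addend and
// split the 64-bit result into its low and high halves.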
1015void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1016 SDLoc SL(N);
1017 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1018 unsigned Opc;
1019 if (Subtarget->hasMADIntraFwdBug())
1020 Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
1021 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
1022 else
1023 Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1024
1025 SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1026 SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1027 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1028 SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1029 if (!SDValue(N, 0).use_empty()) {
1030 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1031 SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1032 MVT::i32, SDValue(Mad, 0), Sub0);
1033 ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1034 }
1035 if (!SDValue(N, 1).use_empty()) {
1036 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1037 SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1038 MVT::i32, SDValue(Mad, 0), Sub1);
1039 ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1040 }
1041 CurDAG->RemoveDeadNode(N);
1042}
1043
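// A DS offset must fit in an unsigned 16-bit field; on subtargets where a
// negative base breaks the offset addressing, the base must also be known
// non-negative unless unsafe folding is enabled.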
1044bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1045 if (!isUInt<16>(Offset))
1046 return false;
1047
1048 if (!Base || Subtarget->hasUsableDSOffset() ||
1049 Subtarget->unsafeDSOffsetFoldingEnabled())
1050 return true;
1051
1052 // On Southern Islands, instructions with a negative base value and an offset
1053 // don't seem to work.
1054 return CurDAG->SignBitIsZero(Base);
1055}
1056
1057bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1058 SDValue &Offset) const {
1059 SDLoc DL(Addr);
1060 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1061 SDValue N0 = Addr.getOperand(0);
1062 SDValue N1 = Addr.getOperand(1);
1063 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1064 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1065 // (add n0, c0)
1066 Base = N0;
1067 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1068 return true;
1069 }
1070 } else if (Addr.getOpcode() == ISD::SUB) {
1071 // sub C, x -> add (sub 0, x), C
1072 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1073 int64_t ByteOffset = C->getSExtValue();
1074 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1075 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1076
1077 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1078 // the known bits in isDSOffsetLegal. We need to emit the selected node
1079 // here, so this is thrown away.
1080 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1081 Zero, Addr.getOperand(1));
1082
1083 if (isDSOffsetLegal(Sub, ByteOffset)) {
1084 SmallVector<SDValue, 3> Opnds;
1085 Opnds.push_back(Zero);
1086 Opnds.push_back(Addr.getOperand(1));
1087
1088 // FIXME: Select to VOP3 version for with-carry.
1089 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1090 if (Subtarget->hasAddNoCarry()) {
1091 SubOp = AMDGPU::V_SUB_U32_e64;
1092 Opnds.push_back(
1093 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1094 }
1095
1096 MachineSDNode *MachineSub =
1097 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1098
1099 Base = SDValue(MachineSub, 0);
1100 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1101 return true;
1102 }
1103 }
1104 }
1105 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1106 // If we have a constant address, prefer to put the constant into the
1107 // offset. This can save moves to load the constant address since multiple
1108 // operations can share the zero base address register, and enables merging
1109 // into read2 / write2 instructions.
1110
1111 SDLoc DL(Addr);
1112
1113 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1114 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1115 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1116 DL, MVT::i32, Zero);
1117 Base = SDValue(MovZero, 0);
1118 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1119 return true;
1120 }
1121 }
1122
1123 // default case
1124 Base = Addr;
1125 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1126 return true;
1127}
1128
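// Offsets for DS read2/write2 are encoded as element counts: each must be a
// multiple of the access size and fit in 8 bits once scaled.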
1129bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1130 unsigned Offset1,
1131 unsigned Size) const {
1132 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1133 return false;
1134 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1135 return false;
1136
1137 if (!Base || Subtarget->hasUsableDSOffset() ||
1138 Subtarget->unsafeDSOffsetFoldingEnabled())
1139 return true;
1140
1141 // On Southern Islands, instructions with a negative base value and an offset
1142 // don't seem to work.
1143 return CurDAG->SignBitIsZero(Base);
1144}
1145
1146// Return whether the operation has the NoUnsignedWrap property.
1147static bool isNoUnsignedWrap(SDValue Addr) {
1148 return (Addr.getOpcode() == ISD::ADD &&
1149 Addr->getFlags().hasNoUnsignedWrap()) ||
1150 Addr->getOpcode() == ISD::OR;
1151}
1152
1153// Check that the base address of flat scratch load/store in the form of `base +
1154// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
1155// requirement). We always treat the first operand as the base address here.
1156bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
1157 if (isNoUnsignedWrap(Addr))
1158 return true;
1159
1160 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1161 // values.
1162 if (Subtarget->hasSignedScratchOffsets())
1163 return true;
1164
1165 auto LHS = Addr.getOperand(0);
1166 auto RHS = Addr.getOperand(1);
1167
1168 // If the immediate offset is negative and within certain range, the base
1169 // address cannot also be negative. If the base is also negative, the sum
1170 // would be either negative or much larger than the valid range of scratch
1171 // memory a thread can access.
1172 ConstantSDNode *ImmOp = nullptr;
1173 if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
1174 if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
1175 return true;
1176 }
1177
1178 return CurDAG->SignBitIsZero(LHS);
1179}
1180
1181// Check that the address value in SGPR/VGPR is legal for flat scratch in the
1182// form of: SGPR + VGPR.
1183bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1184 if (isNoUnsignedWrap(Addr))
1185 return true;
1186
1187 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1188 // values.
1189 if (Subtarget->hasSignedScratchOffsets())
1190 return true;
1191
1192 auto LHS = Addr.getOperand(0);
1193 auto RHS = Addr.getOperand(1);
1194 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1195}
1196
1197// Check that the address value in SGPR/VGPR is legal for flat scratch in the
1198// form of: SGPR + VGPR + Imm.
1199bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
1200 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1201 // values.
1202 if (AMDGPU::isGFX12Plus(*Subtarget))
1203 return true;
1204
1205 auto Base = Addr.getOperand(0);
1206 auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
1207 // If the immediate offset is negative and within certain range, the base
1208 // address cannot also be negative. If the base is also negative, the sum
1209 // would be either negative or much larger than the valid range of scratch
1210 // memory a thread can access.
1211 if (isNoUnsignedWrap(Base) &&
1212 (isNoUnsignedWrap(Addr) ||
1213 (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
1214 return true;
1215
1216 auto LHS = Base.getOperand(0);
1217 auto RHS = Base.getOperand(1);
1218 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1219}
1220
1221// TODO: If offset is too big, put low 16-bit into offset.
1222bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1223 SDValue &Offset0,
1224 SDValue &Offset1) const {
1225 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1226}
1227
1228bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1229 SDValue &Offset0,
1230 SDValue &Offset1) const {
1231 return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1232}
1233
1234bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1235 SDValue &Offset0, SDValue &Offset1,
1236 unsigned Size) const {
1237 SDLoc DL(Addr);
1238
1239 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1240 SDValue N0 = Addr.getOperand(0);
1241 SDValue N1 = Addr.getOperand(1);
1242 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1243 unsigned OffsetValue0 = C1->getZExtValue();
1244 unsigned OffsetValue1 = OffsetValue0 + Size;
1245
1246 // (add n0, c0)
1247 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1248 Base = N0;
1249 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1250 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1251 return true;
1252 }
1253 } else if (Addr.getOpcode() == ISD::SUB) {
1254 // sub C, x -> add (sub 0, x), C
1255 if (const ConstantSDNode *C =
1256 dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1257 unsigned OffsetValue0 = C->getZExtValue();
1258 unsigned OffsetValue1 = OffsetValue0 + Size;
1259
1260 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1261 SDLoc DL(Addr);
1262 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1263
1264 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1265 // the known bits in isDSOffsetLegal. We need to emit the selected node
1266 // here, so this is thrown away.
1267 SDValue Sub =
1268 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1269
1270 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1271 SmallVector<SDValue, 3> Opnds;
1272 Opnds.push_back(Zero);
1273 Opnds.push_back(Addr.getOperand(1));
1274 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1275 if (Subtarget->hasAddNoCarry()) {
1276 SubOp = AMDGPU::V_SUB_U32_e64;
1277 Opnds.push_back(
1278 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1279 }
1280
1281 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1282 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1283
1284 Base = SDValue(MachineSub, 0);
1285 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1286 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1287 return true;
1288 }
1289 }
1290 }
1291 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1292 unsigned OffsetValue0 = CAddr->getZExtValue();
1293 unsigned OffsetValue1 = OffsetValue0 + Size;
1294
1295 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1296 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1297 MachineSDNode *MovZero =
1298 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1299 Base = SDValue(MovZero, 0);
1300 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1301 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1302 return true;
1303 }
1304 }
1305
1306 // default case
1307
1308 Base = Addr;
1309 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1310 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1311 return true;
1312}
1313
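// Decompose a MUBUF address into the resource pointer, vaddr, soffset and
// immediate offset, plus the offen/idxen/addr64 flags.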
1314bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1315 SDValue &SOffset, SDValue &Offset,
1316 SDValue &Offen, SDValue &Idxen,
1317 SDValue &Addr64) const {
1318 // Subtarget prefers to use flat instructions
1319 // FIXME: This should be a pattern predicate and not reach here
1320 if (Subtarget->useFlatForGlobal())
1321 return false;
1322
1323 SDLoc DL(Addr);
1324
1325 Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1326 Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1327 Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1328 SOffset = Subtarget->hasRestrictedSOffset()
1329 ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
1330 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1331
1332 ConstantSDNode *C1 = nullptr;
1333 SDValue N0 = Addr;
1334 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1335 C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1336 if (isUInt<32>(C1->getZExtValue()))
1337 N0 = Addr.getOperand(0);
1338 else
1339 C1 = nullptr;
1340 }
1341
1342 if (N0.getOpcode() == ISD::ADD) {
1343 // (add N2, N3) -> addr64, or
1344 // (add (add N2, N3), C1) -> addr64
1345 SDValue N2 = N0.getOperand(0);
1346 SDValue N3 = N0.getOperand(1);
1347 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1348
1349 if (N2->isDivergent()) {
1350 if (N3->isDivergent()) {
1351 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1352 // addr64, and construct the resource from a 0 address.
1353 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1354 VAddr = N0;
1355 } else {
1356 // N2 is divergent, N3 is not.
1357 Ptr = N3;
1358 VAddr = N2;
1359 }
1360 } else {
1361 // N2 is not divergent.
1362 Ptr = N2;
1363 VAddr = N3;
1364 }
1365 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1366 } else if (N0->isDivergent()) {
1367 // N0 is divergent. Use it as the addr64, and construct the resource from a
1368 // 0 address.
1369 Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1370 VAddr = N0;
1371 Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1372 } else {
1373 // N0 -> offset, or
1374 // (N0 + C1) -> offset
1375 VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1376 Ptr = N0;
1377 }
1378
1379 if (!C1) {
1380 // No offset.
1381 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1382 return true;
1383 }
1384
1385 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1386 if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
1387 // Legal offset for instruction.
1388 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
1389 return true;
1390 }
1391
1392 // Illegal offset, store it in soffset.
1393 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1394 SOffset =
1395 SDValue(CurDAG->getMachineNode(
1396 AMDGPU::S_MOV_B32, DL, MVT::i32,
1397 CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1398 0);
1399 return true;
1400}
1401
1402bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1403 SDValue &VAddr, SDValue &SOffset,
1404 SDValue &Offset) const {
1405 SDValue Ptr, Offen, Idxen, Addr64;
1406
1407 // addr64 bit was removed for volcanic islands.
1408 // FIXME: This should be a pattern predicate and not reach here
1409 if (!Subtarget->hasAddr64())
1410 return false;
1411
1412 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1413 return false;
1414
1415 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1416 if (C->getSExtValue()) {
1417 SDLoc DL(Addr);
1418
1419 const SITargetLowering& Lowering =
1420 *static_cast<const SITargetLowering*>(getTargetLowering());
1421
1422 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1423 return true;
1424 }
1425
1426 return false;
1427}
1428
1429std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1430 SDLoc DL(N);
1431
1432 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1433 SDValue TFI =
1434 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1435
1436 // We rebase the base address into an absolute stack address and hence
1437 // use constant 0 for soffset. This value must be retained until
1438 // frame elimination and eliminateFrameIndex will choose the appropriate
1439 // frame register if need be.
1440 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1441}
1442
1443bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1444 SDValue Addr, SDValue &Rsrc,
1445 SDValue &VAddr, SDValue &SOffset,
1446 SDValue &ImmOffset) const {
1447
1448 SDLoc DL(Addr);
1449 MachineFunction &MF = CurDAG->getMachineFunction();
1450 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1451
1452 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1453
1454 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1455 int64_t Imm = CAddr->getSExtValue();
1456 const int64_t NullPtr =
1457 AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1458 // Don't fold null pointer.
1459 if (Imm != NullPtr) {
1460 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1461 SDValue HighBits =
1462 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1463 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1464 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1465 VAddr = SDValue(MovHighBits, 0);
1466
1467 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1468 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1469 return true;
1470 }
1471 }
1472
1473 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1474 // (add n0, c1)
1475
1476 SDValue N0 = Addr.getOperand(0);
1477 uint64_t C1 = Addr.getConstantOperandVal(1);
1478
1479 // Offsets in vaddr must be positive if range checking is enabled.
1480 //
1481 // The total computation of vaddr + soffset + offset must not overflow. If
1482 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1483 // overflowing.
1484 //
1485 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1486 // always perform a range check. If a negative vaddr base index was used,
1487 // this would fail the range check. The overall address computation would
1488 // compute a valid address, but this doesn't happen due to the range
1489 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1490 //
1491 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1492 // MUBUF vaddr, but not on older subtargets which can only do this if the
1493 // sign bit is known 0.
1494 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1495 if (TII->isLegalMUBUFImmOffset(C1) &&
1496 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1497 CurDAG->SignBitIsZero(N0))) {
1498 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1499 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1500 return true;
1501 }
1502 }
1503
1504 // (node)
1505 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1506 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1507 return true;
1508}
1509
1510static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1511 if (Val.getOpcode() != ISD::CopyFromReg)
1512 return false;
1513 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1514 if (!Reg.isPhysical())
1515 return false;
1516 auto RC = TRI.getPhysRegBaseClass(Reg);
1517 return RC && TRI.isSGPRClass(RC);
1518}
1519
1520bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1521 SDValue Addr,
1522 SDValue &SRsrc,
1523 SDValue &SOffset,
1524 SDValue &Offset) const {
1525 const SIRegisterInfo *TRI =
1526 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1527 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1528 MachineFunction &MF = CurDAG->getMachineFunction();
1529 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1530 SDLoc DL(Addr);
1531
1532 // CopyFromReg <sgpr>
1533 if (IsCopyFromSGPR(*TRI, Addr)) {
1534 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1535 SOffset = Addr;
1536 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1537 return true;
1538 }
1539
1540 ConstantSDNode *CAddr;
1541 if (Addr.getOpcode() == ISD::ADD) {
1542 // Add (CopyFromReg <sgpr>) <constant>
1543 CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1544 if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1545 return false;
1546 if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1547 return false;
1548
1549 SOffset = Addr.getOperand(0);
1550 } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1551 TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1552 // <constant>
1553 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1554 } else {
1555 return false;
1556 }
1557
1558 SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1559
1560 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
1561 return true;
1562}
1563
1564bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1565 SDValue &SOffset, SDValue &Offset
1566 ) const {
1567 SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1568 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1569
1570 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1571 return false;
1572
1573 if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1574 !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1575 !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1576 uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1577 APInt::getAllOnes(32).getZExtValue(); // Size
1578 SDLoc DL(Addr);
1579
1580 const SITargetLowering& Lowering =
1581 *static_cast<const SITargetLowering*>(getTargetLowering());
1582
1583 SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1584 return true;
1585 }
1586 return false;
1587}
1588
1589bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1590 SDValue &SOffset) const {
1591 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1592 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1593 return true;
1594 }
1595
1596 SOffset = ByteOffsetNode;
1597 return true;
1598}
1599
1600// Find a load or store from the corresponding pattern root.
1601// Roots may be build_vector, bitconvert or their combinations.
1602static MemSDNode* findMemSDNode(SDNode *N) {
1603 N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1604 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1605 return MN;
1606 assert(isa<BuildVectorSDNode>(N));
1607 for (SDValue V : N->op_values())
1608 if (MemSDNode *MN =
1609 dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1610 return MN;
1611 llvm_unreachable("cannot find MemSDNode in the pattern!");
1612}
1613
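// Split a flat address into a base vaddr and a legal immediate offset for the
// given flat variant; if the constant offset does not fit, the remainder is
// added back onto the base with VALU adds.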
1614bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1615 SDValue &VAddr, SDValue &Offset,
1616 uint64_t FlatVariant) const {
1617 int64_t OffsetVal = 0;
1618
1619 unsigned AS = findMemSDNode(N)->getAddressSpace();
1620
1621 bool CanHaveFlatSegmentOffsetBug =
1622 Subtarget->hasFlatSegmentOffsetBug() &&
1623 FlatVariant == SIInstrFlags::FLAT &&
1624 (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS);
1625
1626 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1627 SDValue N0, N1;
1628 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1629 (FlatVariant != SIInstrFlags::FlatScratch ||
1630 isFlatScratchBaseLegal(Addr))) {
1631 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1632
1633 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1634 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1635 Addr = N0;
1636 OffsetVal = COffsetVal;
1637 } else {
1638 // If the offset doesn't fit, put the low bits into the offset field and
1639 // add the rest.
1640 //
1641 // For a FLAT instruction the hardware decides whether to access
1642 // global/scratch/shared memory based on the high bits of vaddr,
1643 // ignoring the offset field, so we have to ensure that when we add
1644 // remainder to vaddr it still points into the same underlying object.
1645 // The easiest way to do that is to make sure that we split the offset
1646 // into two pieces that are both >= 0 or both <= 0.
1647
1648 SDLoc DL(N);
1649 uint64_t RemainderOffset;
1650
1651 std::tie(OffsetVal, RemainderOffset) =
1652 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1653
1654 SDValue AddOffsetLo =
1655 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1656 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1657
1658 if (Addr.getValueType().getSizeInBits() == 32) {
1659 SmallVector<SDValue, 3> Opnds;
1660 Opnds.push_back(N0);
1661 Opnds.push_back(AddOffsetLo);
1662 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1663 if (Subtarget->hasAddNoCarry()) {
1664 AddOp = AMDGPU::V_ADD_U32_e64;
1665 Opnds.push_back(Clamp);
1666 }
1667 Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1668 } else {
1669 // TODO: Should this try to use a scalar add pseudo if the base address
1670 // is uniform and saddr is usable?
1671 SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1672 SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1673
1674 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1675 DL, MVT::i32, N0, Sub0);
1676 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1677 DL, MVT::i32, N0, Sub1);
1678
1679 SDValue AddOffsetHi =
1680 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1681
1682 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1683
1684 SDNode *Add =
1685 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1686 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1687
1688 SDNode *Addc = CurDAG->getMachineNode(
1689 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1690 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1691
1692 SDValue RegSequenceArgs[] = {
1693 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1694 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1695
1696 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1697 MVT::i64, RegSequenceArgs),
1698 0);
1699 }
1700 }
1701 }
1702 }
1703
1704 VAddr = Addr;
1705 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1706 return true;
1707}
1708
1709bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1710 SDValue &VAddr,
1711 SDValue &Offset) const {
1712 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1713}
1714
1715bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1716 SDValue &VAddr,
1717 SDValue &Offset) const {
1718 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1719}
1720
1721bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1722 SDValue &VAddr,
1723 SDValue &Offset) const {
1724 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1725 SIInstrFlags::FlatScratch);
1726}
1727
1728// If this matches zero_extend i32:x, return x
1729static SDValue matchZExtFromI32(SDValue Op) {
1730 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1731 return SDValue();
1732
1733 SDValue ExtSrc = Op.getOperand(0);
1734 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1735}
1736
1737// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1738bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1739 SDValue Addr,
1740 SDValue &SAddr,
1741 SDValue &VOffset,
1742 SDValue &Offset) const {
1743 int64_t ImmOffset = 0;
1744
1745 // Match the immediate offset first, which canonically is moved as low as
1746 // possible.
1747
1748 SDValue LHS, RHS;
1749 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1750 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1751 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1752
1753 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1754 SIInstrFlags::FlatGlobal)) {
1755 Addr = LHS;
1756 ImmOffset = COffsetVal;
1757 } else if (!LHS->isDivergent()) {
1758 if (COffsetVal > 0) {
1759 SDLoc SL(N);
1760 // saddr + large_offset -> saddr +
1761 // (voffset = large_offset & ~MaxOffset) +
1762 // (large_offset & MaxOffset);
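// Illustrative split, assuming a 13-bit signed immediate field: a constant
// offset of 0x12345 becomes RemainderOffset = 0x12000 (materialized into
// voffset below) and SplitImmOffset = 0x345 (kept as the instruction offset).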
1763 int64_t SplitImmOffset, RemainderOffset;
1764 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1765 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1766
1767 if (isUInt<32>(RemainderOffset)) {
1768 SDNode *VMov = CurDAG->getMachineNode(
1769 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1770 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1771 VOffset = SDValue(VMov, 0);
1772 SAddr = LHS;
1773 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1774 return true;
1775 }
1776 }
1777
1778 // We are adding a 64-bit SGPR and a constant. If the constant bus limit
1779 // is 1, we would need to perform 1 or 2 extra moves for each half of
1780 // the constant, and it is better to do a scalar add and then issue a
1781 // single VALU instruction to materialize zero. Otherwise it takes fewer
1782 // instructions to perform VALU adds with immediates or inline literals.
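// For example, if both 32-bit halves of the constant are non-inline literals
// (NumLiterals == 2) and the bus limit is 1, we do not bail out here; the
// uniform (base + constant) add stays as SAddr and a single V_MOV_B32 of zero
// becomes the voffset below.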
1783 unsigned NumLiterals =
1784 !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1785 !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1786 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1787 return false;
1788 }
1789 }
1790
1791 // Match the variable offset.
1792 if (Addr.getOpcode() == ISD::ADD) {
1793 LHS = Addr.getOperand(0);
1794 RHS = Addr.getOperand(1);
1795
1796 if (!LHS->isDivergent()) {
1797 // add (i64 sgpr), (zero_extend (i32 vgpr))
1798 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1799 SAddr = LHS;
1800 VOffset = ZextRHS;
1801 }
1802 }
1803
1804 if (!SAddr && !RHS->isDivergent()) {
1805 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1806 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1807 SAddr = RHS;
1808 VOffset = ZextLHS;
1809 }
1810 }
1811
1812 if (SAddr) {
1813 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1814 return true;
1815 }
1816 }
1817
1818 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1819 isa<ConstantSDNode>(Addr))
1820 return false;
1821
1822 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1823 // moves required to copy a 64-bit SGPR to VGPR.
1824 SAddr = Addr;
1825 SDNode *VMov =
1826 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1827 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1828 VOffset = SDValue(VMov, 0);
1829 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1830 return true;
1831}
1832
1833 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1834 if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1835 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1836 } else if (SAddr.getOpcode() == ISD::ADD &&
1837 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1838 // Materialize this into a scalar move for scalar address to avoid
1839 // readfirstlane.
1840 auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1841 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1842 FI->getValueType(0));
1843 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1844 MVT::i32, TFI, SAddr.getOperand(1)),
1845 0);
1846 }
1847
1848 return SAddr;
1849}
1850
1851// Match (32-bit SGPR base) + sext(imm offset)
1852bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1853 SDValue &SAddr,
1854 SDValue &Offset) const {
1855 if (Addr->isDivergent())
1856 return false;
1857
1858 SDLoc DL(Addr);
1859
1860 int64_t COffsetVal = 0;
1861
1862 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1863 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1864 SAddr = Addr.getOperand(0);
1865 } else {
1866 SAddr = Addr;
1867 }
1868
1869 SAddr = SelectSAddrFI(CurDAG, SAddr);
1870
1871 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1872
1873 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1874 SIInstrFlags::FlatScratch)) {
1875 int64_t SplitImmOffset, RemainderOffset;
1876 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1877 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1878
1879 COffsetVal = SplitImmOffset;
1880
1881 SDValue AddOffset =
1882 SAddr.getOpcode() == ISD::TargetFrameIndex
1883 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1884 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1885 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1886 SAddr, AddOffset),
1887 0);
1888 }
1889
1890 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1891
1892 return true;
1893}
1894
1895// Check whether the flat scratch SVS swizzle bug affects this access.
1896bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1897 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1898 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1899 return false;
1900
1901 // The bug affects the swizzling of SVS accesses if there is any carry out
1902 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1903 // voffset to (soffset + inst_offset).
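// Worked example: if the low two bits of voffset can be 3 and the low two
// bits of (soffset + inst_offset) can be 2, then (3 & 3) + (2 & 3) == 5 >= 4,
// so a carry out of bit 1 is possible and the access is treated as affected.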
1904 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1905 KnownBits SKnown = KnownBits::computeForAddSub(
1906 true, false, CurDAG->computeKnownBits(SAddr),
1907 KnownBits::makeConstant(APInt(32, ImmOffset)));
1908 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1909 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1910 return (VMax & 3) + (SMax & 3) >= 4;
1911}
1912
1913bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1914 SDValue &VAddr, SDValue &SAddr,
1915 SDValue &Offset) const {
1916 int64_t ImmOffset = 0;
1917
1918 SDValue LHS, RHS;
1919 SDValue OrigAddr = Addr;
1920 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1921 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1922 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1923
1924 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1925 Addr = LHS;
1926 ImmOffset = COffsetVal;
1927 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1928 SDLoc SL(N);
1929 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1930 // (large_offset & MaxOffset);
1931 int64_t SplitImmOffset, RemainderOffset;
1932 std::tie(SplitImmOffset, RemainderOffset)
1933 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1934
1935 if (isUInt<32>(RemainderOffset)) {
1936 SDNode *VMov = CurDAG->getMachineNode(
1937 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1938 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1939 VAddr = SDValue(VMov, 0);
1940 SAddr = LHS;
1941 if (!isFlatScratchBaseLegal(Addr))
1942 return false;
1943 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1944 return false;
1945 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1946 return true;
1947 }
1948 }
1949 }
1950
1951 if (Addr.getOpcode() != ISD::ADD)
1952 return false;
1953
1954 LHS = Addr.getOperand(0);
1955 RHS = Addr.getOperand(1);
1956
1957 if (!LHS->isDivergent() && RHS->isDivergent()) {
1958 SAddr = LHS;
1959 VAddr = RHS;
1960 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1961 SAddr = RHS;
1962 VAddr = LHS;
1963 } else {
1964 return false;
1965 }
1966
1967 if (OrigAddr != Addr) {
1968 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1969 return false;
1970 } else {
1971 if (!isFlatScratchBaseLegalSV(OrigAddr))
1972 return false;
1973 }
1974
1975 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1976 return false;
1977 SAddr = SelectSAddrFI(CurDAG, SAddr);
1978 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1979 return true;
1980}
1981
1982// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
1983// not null) offset. If Imm32Only is true, match only 32-bit immediate
1984// offsets available on CI.
1985bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1986 SDValue *SOffset, SDValue *Offset,
1987 bool Imm32Only, bool IsBuffer) const {
1988 assert((!SOffset || !Offset) &&
1989 "Cannot match both soffset and offset at the same time!");
1990
1991 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1992 if (!C) {
1993 if (!SOffset)
1994 return false;
1995 if (ByteOffsetNode.getValueType().isScalarInteger() &&
1996 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1997 *SOffset = ByteOffsetNode;
1998 return true;
1999 }
2000 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2001 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2002 *SOffset = ByteOffsetNode.getOperand(0);
2003 return true;
2004 }
2005 }
2006 return false;
2007 }
2008
2009 SDLoc SL(ByteOffsetNode);
2010
2011 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2012 // offset for S_BUFFER instructions is unsigned.
2013 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2014 std::optional<int64_t> EncodedOffset =
2015 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
2016 if (EncodedOffset && Offset && !Imm32Only) {
2017 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2018 return true;
2019 }
2020
2021 // SGPR and literal offsets are unsigned.
2022 if (ByteOffset < 0)
2023 return false;
2024
2025 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2026 if (EncodedOffset && Offset && Imm32Only) {
2027 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2028 return true;
2029 }
2030
2031 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2032 return false;
2033
2034 if (SOffset) {
2035 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2036 *SOffset = SDValue(
2037 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2038 return true;
2039 }
2040
2041 return false;
2042}
2043
2044SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2045 if (Addr.getValueType() != MVT::i32)
2046 return Addr;
2047
2048 // Zero-extend a 32-bit address.
2049 SDLoc SL(Addr);
2050
2051 const MachineFunction &MF = CurDAG->getMachineFunction();
2052 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2053 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2054 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2055
2056 const SDValue Ops[] = {
2057 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2058 Addr,
2059 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2060 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2061 0),
2062 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2063 };
2064
2065 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2066 Ops), 0);
2067}
2068
2069// Match a base and an immediate (if Offset is not null) or an SGPR (if
2070// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2071// true, match only 32-bit immediate offsets available on CI.
2072bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2073 SDValue *SOffset, SDValue *Offset,
2074 bool Imm32Only,
2075 bool IsBuffer) const {
2076 if (SOffset && Offset) {
2077 assert(!Imm32Only && !IsBuffer);
2078 SDValue B;
2079 return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
2080 SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
2081 }
2082
2083 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2084 // wraparound, because s_load instructions perform the addition in 64 bits.
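// For example, base 0xFFFFFFF0 + offset 0x20 wraps to 0x10 in 32-bit
// arithmetic, but the hardware computes 0x100000010, so only match an add
// that carries the nuw flag.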
2085 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2086 !Addr->getFlags().hasNoUnsignedWrap())
2087 return false;
2088
2089 SDValue N0, N1;
2090 // Extract the base and offset if possible.
2091 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2092 N0 = Addr.getOperand(0);
2093 N1 = Addr.getOperand(1);
2094 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2095 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2096 }
2097 if (!N0 || !N1)
2098 return false;
2099 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2100 SBase = N0;
2101 return true;
2102 }
2103 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2104 SBase = N1;
2105 return true;
2106 }
2107 return false;
2108}
2109
2110bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2111 SDValue *SOffset, SDValue *Offset,
2112 bool Imm32Only) const {
2113 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2114 SBase = Expand32BitAddress(SBase);
2115 return true;
2116 }
2117
2118 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2119 SBase = Expand32BitAddress(Addr);
2120 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2121 return true;
2122 }
2123
2124 return false;
2125}
2126
2127bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2128 SDValue &Offset) const {
2129 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2130}
2131
2132bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2133 SDValue &Offset) const {
2134 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2135 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2136 /* Imm32Only */ true);
2137}
2138
2139bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2140 SDValue &SOffset) const {
2141 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2142}
2143
2144bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2145 SDValue &SOffset,
2146 SDValue &Offset) const {
2147 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2148}
2149
2150bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2151 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2152 /* Imm32Only */ false, /* IsBuffer */ true);
2153}
2154
2155bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2156 SDValue &Offset) const {
2157 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2158 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2159 /* Imm32Only */ true, /* IsBuffer */ true);
2160}
2161
2162bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2163 SDValue &Offset) const {
2164 // Match the (soffset + offset) pair as a 32-bit register base and
2165 // an immediate offset.
2166 return N.getValueType() == MVT::i32 &&
2167 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2168 &Offset, /* Imm32Only */ false,
2169 /* IsBuffer */ true);
2170}
2171
2172bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2173 SDValue &Base,
2174 SDValue &Offset) const {
2175 SDLoc DL(Index);
2176
2177 if (CurDAG->isBaseWithConstantOffset(Index)) {
2178 SDValue N0 = Index.getOperand(0);
2179 SDValue N1 = Index.getOperand(1);
2180 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2181
2182 // (add n0, c0)
2183 // Don't peel off the offset (c0) if doing so could possibly lead
2184 // the base (n0) to be negative.
2185 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2186 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2187 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2188 Base = N0;
2189 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2190 return true;
2191 }
2192 }
2193
2194 if (isa<ConstantSDNode>(Index))
2195 return false;
2196
2197 Base = Index;
2198 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2199 return true;
2200}
2201
2202SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2203 SDValue Val, uint32_t Offset,
2204 uint32_t Width) {
2205 if (Val->isDivergent()) {
2206 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2207 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2208 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2209
2210 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2211 }
2212 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2213 // Transformation function, pack the offset and width of a BFE into
2214 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2215 // source, bits [5:0] contain the offset and bits [22:16] the width.
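// For example, Offset = 16 and Width = 8 pack to (16 | (8 << 16)) == 0x80010.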
2216 uint32_t PackedVal = Offset | (Width << 16);
2217 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2218
2219 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2220}
2221
2222void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2223 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2224 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2225 // Predicate: 0 < b <= c < 32
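// For example, ((a << 8) srl 24) becomes BFE_U32 a, 16, 8 (offset = 24 - 8,
// width = 32 - 24), i.e. it extracts bits [23:16] of a.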
2226
2227 const SDValue &Shl = N->getOperand(0);
2228 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2229 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2230
2231 if (B && C) {
2232 uint32_t BVal = B->getZExtValue();
2233 uint32_t CVal = C->getZExtValue();
2234
2235 if (0 < BVal && BVal <= CVal && CVal < 32) {
2236 bool Signed = N->getOpcode() == ISD::SRA;
2237 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2238 32 - CVal));
2239 return;
2240 }
2241 }
2242 SelectCode(N);
2243}
2244
2245void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2246 switch (N->getOpcode()) {
2247 case ISD::AND:
2248 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2249 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2250 // Predicate: isMask(mask)
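// For example, ((a srl 4) & 0xff) becomes BFE_U32 a, 4, 8
// (offset = 4, width = popcount(0xff) = 8).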
2251 const SDValue &Srl = N->getOperand(0);
2252 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2253 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2254
2255 if (Shift && Mask) {
2256 uint32_t ShiftVal = Shift->getZExtValue();
2257 uint32_t MaskVal = Mask->getZExtValue();
2258
2259 if (isMask_32(MaskVal)) {
2260 uint32_t WidthVal = llvm::popcount(MaskVal);
2261 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2262 WidthVal));
2263 return;
2264 }
2265 }
2266 }
2267 break;
2268 case ISD::SRL:
2269 if (N->getOperand(0).getOpcode() == ISD::AND) {
2270 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2271 // Predicate: isMask(mask >> b)
2272 const SDValue &And = N->getOperand(0);
2273 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2274 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2275
2276 if (Shift && Mask) {
2277 uint32_t ShiftVal = Shift->getZExtValue();
2278 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2279
2280 if (isMask_32(MaskVal)) {
2281 uint32_t WidthVal = llvm::popcount(MaskVal);
2282 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2283 WidthVal));
2284 return;
2285 }
2286 }
2287 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2288 SelectS_BFEFromShifts(N);
2289 return;
2290 }
2291 break;
2292 case ISD::SRA:
2293 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2294 SelectS_BFEFromShifts(N);
2295 return;
2296 }
2297 break;
2298
2299 case ISD::SIGN_EXTEND_INREG: {
2300 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2301 SDValue Src = N->getOperand(0);
2302 if (Src.getOpcode() != ISD::SRL)
2303 break;
2304
2305 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2306 if (!Amt)
2307 break;
2308
2309 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2310 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2311 Amt->getZExtValue(), Width));
2312 return;
2313 }
2314 }
2315
2316 SelectCode(N);
2317}
2318
2319bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2320 assert(N->getOpcode() == ISD::BRCOND);
2321 if (!N->hasOneUse())
2322 return false;
2323
2324 SDValue Cond = N->getOperand(1);
2325 if (Cond.getOpcode() == ISD::CopyToReg)
2326 Cond = Cond.getOperand(2);
2327
2328 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2329 return false;
2330
2331 MVT VT = Cond.getOperand(0).getSimpleValueType();
2332 if (VT == MVT::i32)
2333 return true;
2334
2335 if (VT == MVT::i64) {
2336 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2337
2338 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2339 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2340 }
2341
2342 return false;
2343}
2344
2345static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2346 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2347 // Special case for amdgcn.ballot:
2348 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2349 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2350 // =>
2351 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2352 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2353 // Cond becomes an i(WaveSize) full mask value.
2354 // Note that ballot doesn't use the SETEQ condition, but it's easy to support it
2355 // here for completeness, so in this case Negate is set true on return.
2356 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2357 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2358 isNullConstant(VCMP.getOperand(1))) {
2359
2360 auto Cond = VCMP.getOperand(0);
2361 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2362 Cond = Cond.getOperand(0);
2363
2364 if (isBoolSGPR(Cond)) {
2365 Negate = VCMP_CC == ISD::SETEQ;
2366 return Cond;
2367 }
2368 }
2369 return SDValue();
2370}
2371
2372void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2373 SDValue Cond = N->getOperand(1);
2374
2375 if (Cond.isUndef()) {
2376 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2377 N->getOperand(2), N->getOperand(0));
2378 return;
2379 }
2380
2381 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2382 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2383
2384 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2385 bool AndExec = !UseSCCBr;
2386 bool Negate = false;
2387
2388 if (Cond.getOpcode() == ISD::SETCC &&
2389 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2390 SDValue VCMP = Cond->getOperand(0);
2391 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2392 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2393 isNullConstant(Cond->getOperand(1)) &&
2394 // We may encounter ballot.i64 in wave32 mode on -O0.
2395 VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
2396 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2397 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2398 // BRCOND i1 %C, %BB
2399 // =>
2400 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2401 // VCC = COPY i(WaveSize) %VCMP
2402 // S_CBRANCH_VCCNZ/VCCZ %BB
2403 Negate = CC == ISD::SETEQ;
2404 bool NegatedBallot = false;
2405 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2406 Cond = BallotCond;
2407 UseSCCBr = !BallotCond->isDivergent();
2408 Negate = Negate ^ NegatedBallot;
2409 } else {
2410 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2411 // selected as V_CMP, but this may change for uniform condition.
2412 Cond = VCMP;
2413 UseSCCBr = false;
2414 }
2415 }
2416 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2417 // V_CMPs resulting from a ballot, or a ballot with a uniform condition in
2418 // which case SCC is used.
2419 AndExec = false;
2420 }
2421
2422 unsigned BrOp =
2423 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2424 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2425 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2426 SDLoc SL(N);
2427
2428 if (AndExec) {
2429 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2430 // analyzed what generates the vcc value, so we do not know whether vcc
2431 // bits for disabled lanes are 0. Thus we need to mask out bits for
2432 // disabled lanes.
2433 //
2434 // For the case that we select S_CBRANCH_SCC1 and it gets
2435 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2436 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2437 //
2438 // We could add an analysis of what generates the vcc value here and omit
2439 // the S_AND when it is unnecessary. But it would be better to add a separate
2440 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2441 // catches both cases.
2442 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2443 : AMDGPU::S_AND_B64,
2444 SL, MVT::i1,
2445 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2446 : AMDGPU::EXEC,
2447 MVT::i1),
2448 Cond),
2449 0);
2450 }
2451
2452 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2453 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2454 N->getOperand(2), // Basic Block
2455 VCC.getValue(0));
2456}
2457
2458void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2459 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2460 !N->isDivergent()) {
2461 SDValue Src = N->getOperand(0);
2462 if (Src.getValueType() == MVT::f16) {
2463 if (isExtractHiElt(Src, Src)) {
2464 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2465 {Src});
2466 return;
2467 }
2468 }
2469 }
2470
2471 SelectCode(N);
2472}
2473
2474void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2475 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2476 // be copied to an SGPR with readfirstlane.
2477 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2478 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2479
2480 SDValue Chain = N->getOperand(0);
2481 SDValue Ptr = N->getOperand(2);
2482 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2483 MachineMemOperand *MMO = M->getMemOperand();
2484 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2485
2486 SDValue Offset;
2487 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2488 SDValue PtrBase = Ptr.getOperand(0);
2489 SDValue PtrOffset = Ptr.getOperand(1);
2490
2491 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2492 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2493 N = glueCopyToM0(N, PtrBase);
2494 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2495 }
2496 }
2497
2498 if (!Offset) {
2499 N = glueCopyToM0(N, Ptr);
2500 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2501 }
2502
2503 SDValue Ops[] = {
2504 Offset,
2505 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2506 Chain,
2507 N->getOperand(N->getNumOperands() - 1) // New glue
2508 };
2509
2510 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2511 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2512}
2513
2514// We need to handle this here because tablegen doesn't support matching
2515// instructions with multiple outputs.
2516void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2517 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2518 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2519 N->getOperand(5), N->getOperand(0)};
2520
2521 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2522 MachineMemOperand *MMO = M->getMemOperand();
2523 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2524 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2525}
2526
2527static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2528 switch (IntrID) {
2529 case Intrinsic::amdgcn_ds_gws_init:
2530 return AMDGPU::DS_GWS_INIT;
2531 case Intrinsic::amdgcn_ds_gws_barrier:
2532 return AMDGPU::DS_GWS_BARRIER;
2533 case Intrinsic::amdgcn_ds_gws_sema_v:
2534 return AMDGPU::DS_GWS_SEMA_V;
2535 case Intrinsic::amdgcn_ds_gws_sema_br:
2536 return AMDGPU::DS_GWS_SEMA_BR;
2537 case Intrinsic::amdgcn_ds_gws_sema_p:
2538 return AMDGPU::DS_GWS_SEMA_P;
2539 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2540 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2541 default:
2542 llvm_unreachable("not a gws intrinsic");
2543 }
2544}
2545
2546void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2547 if (!Subtarget->hasGWS() ||
2548 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2549 !Subtarget->hasGWSSemaReleaseAll())) {
2550 // Let this error.
2551 SelectCode(N);
2552 return;
2553 }
2554
2555 // Chain, intrinsic ID, vsrc, offset
2556 const bool HasVSrc = N->getNumOperands() == 4;
2557 assert(HasVSrc || N->getNumOperands() == 3);
2558
2559 SDLoc SL(N);
2560 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2561 int ImmOffset = 0;
2562 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2563 MachineMemOperand *MMO = M->getMemOperand();
2564
2565 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
2566 // effect, so SIFixSGPRCopies will validly insert a readfirstlane.
2567
2568 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2569 // offset field) % 64. Some versions of the programming guide omit the m0
2570 // part, or claim it's from offset 0.
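// Illustrative example: with M0[21:16] == 3 and an offset field of 1, the
// resource id used is (<isa opaque base> + 3 + 1) % 64.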
2571 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2572 // If we have a constant offset, try to use the 0 in m0 as the base.
2573 // TODO: Look into changing the default m0 initialization value. If the
2574 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2575 // the immediate offset.
2576 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2577 ImmOffset = ConstOffset->getZExtValue();
2578 } else {
2579 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2580 ImmOffset = BaseOffset.getConstantOperandVal(1);
2581 BaseOffset = BaseOffset.getOperand(0);
2582 }
2583
2584 // Prefer to do the shift in an SGPR since it should be possible to use m0
2585 // as the result directly. If it's already an SGPR, it will be eliminated
2586 // later.
2587 SDNode *SGPROffset
2588 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2589 BaseOffset);
2590 // Shift to offset in m0
2591 SDNode *M0Base
2592 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2593 SDValue(SGPROffset, 0),
2594 CurDAG->getTargetConstant(16, SL, MVT::i32));
2595 glueCopyToM0(N, SDValue(M0Base, 0));
2596 }
2597
2598 SDValue Chain = N->getOperand(0);
2599 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2600
2601 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2602 SmallVector<SDValue, 5> Ops;
2603 if (HasVSrc)
2604 Ops.push_back(N->getOperand(2));
2605 Ops.push_back(OffsetField);
2606 Ops.push_back(Chain);
2607
2608 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2609 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2610}
2611
2612void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2613 if (Subtarget->getLDSBankCount() != 16) {
2614 // This is a single instruction with a pattern.
2615 SelectCode(N);
2616 return;
2617 }
2618
2619 SDLoc DL(N);
2620
2621 // This requires 2 instructions. It is possible to write a pattern to support
2622 // this, but the generated isel emitter doesn't correctly deal with multiple
2623 // output instructions using the same physical register input. The copy to m0
2624 // is incorrectly placed before the second instruction.
2625 //
2626 // TODO: Match source modifiers.
2627 //
2628 // def : Pat <
2629 // (int_amdgcn_interp_p1_f16
2630 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2631 // (i32 timm:$attrchan), (i32 timm:$attr),
2632 // (i1 timm:$high), M0),
2633 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2634 // timm:$attrchan, 0,
2635 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2636 // let Predicates = [has16BankLDS];
2637 // }
2638
2639 // 16 bank LDS
2640 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2641 N->getOperand(5), SDValue());
2642
2643 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2644
2645 SDNode *InterpMov =
2646 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2647 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2648 N->getOperand(3), // Attr
2649 N->getOperand(2), // Attrchan
2650 ToM0.getValue(1) // In glue
2651 });
2652
2653 SDNode *InterpP1LV =
2654 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2655 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2656 N->getOperand(1), // Src0
2657 N->getOperand(3), // Attr
2658 N->getOperand(2), // Attrchan
2659 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2660 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2661 N->getOperand(4), // high
2662 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2663 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2664 SDValue(InterpMov, 1)
2665 });
2666
2667 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2668}
2669
2670void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2671 unsigned IntrID = N->getConstantOperandVal(1);
2672 switch (IntrID) {
2673 case Intrinsic::amdgcn_ds_append:
2674 case Intrinsic::amdgcn_ds_consume: {
2675 if (N->getValueType(0) != MVT::i32)
2676 break;
2677 SelectDSAppendConsume(N, IntrID);
2678 return;
2679 }
2680 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2681 SelectDSBvhStackIntrinsic(N);
2682 return;
2683 }
2684
2685 SelectCode(N);
2686}
2687
2688void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2689 unsigned IntrID = N->getConstantOperandVal(0);
2690 unsigned Opcode;
2691 switch (IntrID) {
2692 case Intrinsic::amdgcn_wqm:
2693 Opcode = AMDGPU::WQM;
2694 break;
2695 case Intrinsic::amdgcn_softwqm:
2696 Opcode = AMDGPU::SOFT_WQM;
2697 break;
2698 case Intrinsic::amdgcn_wwm:
2699 case Intrinsic::amdgcn_strict_wwm:
2700 Opcode = AMDGPU::STRICT_WWM;
2701 break;
2702 case Intrinsic::amdgcn_strict_wqm:
2703 Opcode = AMDGPU::STRICT_WQM;
2704 break;
2705 case Intrinsic::amdgcn_interp_p1_f16:
2706 SelectInterpP1F16(N);
2707 return;
2708 case Intrinsic::amdgcn_inverse_ballot:
2709 switch (N->getOperand(1).getValueSizeInBits()) {
2710 case 32:
2711 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2712 break;
2713 case 64:
2714 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2715 break;
2716 default:
2717 llvm_unreachable("Unsupported size for inverse ballot mask.");
2718 }
2719 break;
2720 default:
2721 SelectCode(N);
2722 return;
2723 }
2724
2725 SDValue Src = N->getOperand(1);
2726 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2727}
2728
2729void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2730 unsigned IntrID = N->getConstantOperandVal(1);
2731 switch (IntrID) {
2732 case Intrinsic::amdgcn_ds_gws_init:
2733 case Intrinsic::amdgcn_ds_gws_barrier:
2734 case Intrinsic::amdgcn_ds_gws_sema_v:
2735 case Intrinsic::amdgcn_ds_gws_sema_br:
2736 case Intrinsic::amdgcn_ds_gws_sema_p:
2737 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2738 SelectDS_GWS(N, IntrID);
2739 return;
2740 default:
2741 break;
2742 }
2743
2744 SelectCode(N);
2745}
2746
2747void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2748 SDValue Log2WaveSize =
2749 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2750 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2751 {N->getOperand(0), Log2WaveSize});
2752}
2753
2754void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2755 SDValue SrcVal = N->getOperand(1);
2756 if (SrcVal.getValueType() != MVT::i32) {
2757 SelectCode(N); // Emit default error
2758 return;
2759 }
2760
2761 SDValue CopyVal;
2762 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2763 SDLoc SL(N);
2764
2765 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2766 CopyVal = SrcVal.getOperand(0);
2767 } else {
2768 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2769 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2770
2771 if (N->isDivergent()) {
2772 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2773 MVT::i32, SrcVal),
2774 0);
2775 }
2776
2777 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2778 {SrcVal, Log2WaveSize}),
2779 0);
2780 }
2781
2782 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2783 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2784}
2785
2786bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2787 unsigned &Mods,
2788 bool IsCanonicalizing,
2789 bool AllowAbs) const {
2790 Mods = SISrcMods::NONE;
2791 Src = In;
2792
2793 if (Src.getOpcode() == ISD::FNEG) {
2794 Mods |= SISrcMods::NEG;
2795 Src = Src.getOperand(0);
2796 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2797 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2798 // denormal mode, but we're implicitly canonicalizing in a source operand.
2799 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2800 if (LHS && LHS->isZero()) {
2801 Mods |= SISrcMods::NEG;
2802 Src = Src.getOperand(1);
2803 }
2804 }
2805
2806 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2807 Mods |= SISrcMods::ABS;
2808 Src = Src.getOperand(0);
2809 }
2810
2811 return true;
2812}
2813
2814bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2815 SDValue &SrcMods) const {
2816 unsigned Mods;
2817 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2818 /*AllowAbs=*/true)) {
2819 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2820 return true;
2821 }
2822
2823 return false;
2824}
2825
2826bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2827 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2828 unsigned Mods;
2829 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2830 /*AllowAbs=*/true)) {
2831 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2832 return true;
2833 }
2834
2835 return false;
2836}
2837
2838bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2839 SDValue &SrcMods) const {
2840 unsigned Mods;
2841 if (SelectVOP3ModsImpl(In, Src, Mods,
2842 /*IsCanonicalizing=*/true,
2843 /*AllowAbs=*/false)) {
2844 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2845 return true;
2846 }
2847
2848 return false;
2849}
2850
2851bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2852 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2853 return false;
2854
2855 Src = In;
2856 return true;
2857}
2858
2859bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2860 SDValue &SrcMods,
2861 bool OpSel) const {
2862 unsigned Mods;
2863 if (SelectVOP3ModsImpl(In, Src, Mods,
2864 /*IsCanonicalizing=*/true,
2865 /*AllowAbs=*/false)) {
2866 if (OpSel)
2867 Mods |= SISrcMods::OP_SEL_0;
2868 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2869 return true;
2870 }
2871
2872 return false;
2873}
2874
2875bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2876 SDValue &SrcMods) const {
2877 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2878}
2879
2880bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2881 SDValue &SrcMods) const {
2882 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2883}
2884
2885bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2886 SDValue &SrcMods, SDValue &Clamp,
2887 SDValue &Omod) const {
2888 SDLoc DL(In);
2889 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2890 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2891
2892 return SelectVOP3Mods(In, Src, SrcMods);
2893}
2894
2895bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2896 SDValue &SrcMods, SDValue &Clamp,
2897 SDValue &Omod) const {
2898 SDLoc DL(In);
2899 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2900 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2901
2902 return SelectVOP3BMods(In, Src, SrcMods);
2903}
2904
2905bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2906 SDValue &Clamp, SDValue &Omod) const {
2907 Src = In;
2908
2909 SDLoc DL(In);
2910 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2911 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2912
2913 return true;
2914}
2915
2916bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2917 SDValue &SrcMods, bool IsDOT) const {
2918 unsigned Mods = SISrcMods::NONE;
2919 Src = In;
2920
2921 // TODO: Handle G_FSUB 0 as fneg
2922 if (Src.getOpcode() == ISD::FNEG) {
2923 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2924 Src = Src.getOperand(0);
2925 }
2926
2927 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2928 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2929 unsigned VecMods = Mods;
2930
2931 SDValue Lo = stripBitcast(Src.getOperand(0));
2932 SDValue Hi = stripBitcast(Src.getOperand(1));
2933
2934 if (Lo.getOpcode() == ISD::FNEG) {
2935 Lo = stripBitcast(Lo.getOperand(0));
2936 Mods ^= SISrcMods::NEG;
2937 }
2938
2939 if (Hi.getOpcode() == ISD::FNEG) {
2940 Hi = stripBitcast(Hi.getOperand(0));
2941 Mods ^= SISrcMods::NEG_HI;
2942 }
2943
2944 if (isExtractHiElt(Lo, Lo))
2945 Mods |= SISrcMods::OP_SEL_0;
2946
2947 if (isExtractHiElt(Hi, Hi))
2948 Mods |= SISrcMods::OP_SEL_1;
2949
2950 unsigned VecSize = Src.getValueSizeInBits();
2951 Lo = stripExtractLoElt(Lo);
2952 Hi = stripExtractLoElt(Hi);
2953
2954 if (Lo.getValueSizeInBits() > VecSize) {
2955 Lo = CurDAG->getTargetExtractSubreg(
2956 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2957 MVT::getIntegerVT(VecSize), Lo);
2958 }
2959
2960 if (Hi.getValueSizeInBits() > VecSize) {
2961 Hi = CurDAG->getTargetExtractSubreg(
2962 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2963 MVT::getIntegerVT(VecSize), Hi);
2964 }
2965
2966 assert(Lo.getValueSizeInBits() <= VecSize &&
2967 Hi.getValueSizeInBits() <= VecSize);
2968
2969 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2970 // Really a scalar input. Just select from the low half of the register to
2971 // avoid packing.
2972
2973 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2974 Src = Lo;
2975 } else {
2976 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2977
2978 SDLoc SL(In);
2979 SDValue Undef = SDValue(
2980 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2981 Lo.getValueType()), 0);
2982 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2983 : AMDGPU::SReg_64RegClassID;
2984 const SDValue Ops[] = {
2985 CurDAG->getTargetConstant(RC, SL, MVT::i32),
2986 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2987 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2988
2989 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2990 Src.getValueType(), Ops), 0);
2991 }
2992 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2993 return true;
2994 }
2995
2996 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2997 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2998 .bitcastToAPInt().getZExtValue();
2999 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3000 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3001 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3002 return true;
3003 }
3004 }
3005
3006 Mods = VecMods;
3007 }
3008
3009 // Packed instructions do not have abs modifiers.
3010 Mods |= SISrcMods::OP_SEL_1;
3011
3012 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3013 return true;
3014}
3015
3016bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3017 SDValue &SrcMods) const {
3018 return SelectVOP3PMods(In, Src, SrcMods, true);
3019}
3020
3021bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3022 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3023 // A literal i1 value set in the intrinsic; it represents SrcMods for the next operand.
3024 // 1 promotes packed values to signed, 0 treats them as unsigned.
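// For example, a sign operand of 1 yields OP_SEL_1 | NEG in the returned
// source modifiers, while 0 yields just OP_SEL_1.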
3025 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3026
3027 unsigned Mods = SISrcMods::OP_SEL_1;
3028 unsigned SrcSign = C->getZExtValue();
3029 if (SrcSign == 1)
3030 Mods ^= SISrcMods::NEG;
3031
3032 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3033 return true;
3034}
3035
3036bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3037 SDValue &Src) const {
3038 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3039 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3040
3041 unsigned Mods = SISrcMods::OP_SEL_1;
3042 unsigned SrcVal = C->getZExtValue();
3043 if (SrcVal == 1)
3044 Mods |= SISrcMods::OP_SEL_0;
3045
3046 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3047 return true;
3048}
3049
3050 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3051 llvm::SelectionDAG *CurDAG,
3052 const SDLoc &DL) {
3053 unsigned DstRegClass;
3054 EVT DstTy;
3055 switch (Elts.size()) {
3056 case 8:
3057 DstRegClass = AMDGPU::VReg_256RegClassID;
3058 DstTy = MVT::v8i32;
3059 break;
3060 case 4:
3061 DstRegClass = AMDGPU::VReg_128RegClassID;
3062 DstTy = MVT::v4i32;
3063 break;
3064 case 2:
3065 DstRegClass = AMDGPU::VReg_64RegClassID;
3066 DstTy = MVT::v2i32;
3067 break;
3068 default:
3069 llvm_unreachable("unhandled Reg sequence size");
3070 }
3071
3072 SmallVector<SDValue, 8 + 1> Ops;
3073 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3074 for (unsigned i = 0; i < Elts.size(); ++i) {
3075 Ops.push_back(Elts[i]);
3076 Ops.push_back(CurDAG->getTargetConstant(
3077 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3078 }
3079 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3080}
3081
3082 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3083 llvm::SelectionDAG *CurDAG,
3084 const SDLoc &DL) {
3085 SmallVector<SDValue, 8> PackedElts;
3086 assert("unhandled Reg sequence size" &&
3087 (Elts.size() == 8 || Elts.size() == 16));
3088
3089 // Pack 16-bit elements in pairs into a 32-bit register. If both elements are
3090 // unpacked from the same 32-bit source, use it; otherwise pack them using v_perm.
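// Note, assuming the usual V_PERM_B32 byte-select semantics: selector
// 0x05040100 places the low 16 bits of Elts[i] in the result's low half and
// the low 16 bits of Elts[i+1] in the high half.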
3091 for (unsigned i = 0; i < Elts.size(); i += 2) {
3092 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3093 SDValue HiSrc;
3094 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3095 PackedElts.push_back(HiSrc);
3096 } else {
3097 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3098 MachineSDNode *Packed =
3099 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3100 {Elts[i + 1], Elts[i], PackLoLo});
3101 PackedElts.push_back(SDValue(Packed, 0));
3102 }
3103 }
3104
3105 return buildRegSequence32(PackedElts, CurDAG, DL);
3106}
3107
3108 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3109 llvm::SelectionDAG *CurDAG,
3110 const SDLoc &DL, unsigned ElementSize) {
3111 if (ElementSize == 16)
3112 return buildRegSequence16(Elts, CurDAG, DL);
3113 if (ElementSize == 32)
3114 return buildRegSequence32(Elts, CurDAG, DL);
3115 llvm_unreachable("Unhandled element size");
3116}
3117
3118static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3119 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3120 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3121 unsigned ElementSize) {
3122 if (ModOpcode == ISD::FNEG) {
3123 Mods |= SISrcMods::NEG;
3124 // Check if all elements also have abs modifier
3125 SmallVector<SDValue, 8> NegAbsElts;
3126 for (auto El : Elts) {
3127 if (El.getOpcode() != ISD::FABS)
3128 break;
3129 NegAbsElts.push_back(El->getOperand(0));
3130 }
3131 if (Elts.size() != NegAbsElts.size()) {
3132 // Neg
3133 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3134 } else {
3135 // Neg and Abs
3136 Mods |= SISrcMods::NEG_HI;
3137 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3138 }
3139 } else {
3140 assert(ModOpcode == ISD::FABS);
3141 // Abs
3142 Mods |= SISrcMods::NEG_HI;
3143 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3144 }
3145}
3146
3147 // Check all f16 elements for modifiers while looking through b32 and v2b16
3148 // build vectors; stop if an element does not satisfy ModifierCheck.
3149static void
3150 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3151 std::function<bool(SDValue)> ModifierCheck) {
3152 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3153 if (auto *F16Pair =
3154 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3155 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3156 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3157 if (!ModifierCheck(ElF16))
3158 break;
3159 }
3160 }
3161 }
3162}
3163
3164bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3165 SDValue &SrcMods) const {
3166 Src = In;
3167 unsigned Mods = SISrcMods::OP_SEL_1;
3168
3169 // mods are on f16 elements
3170 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3171 SmallVector<SDValue, 8> EltsF16;
3172
3173 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3174 if (Element.getOpcode() != ISD::FNEG)
3175 return false;
3176 EltsF16.push_back(Element.getOperand(0));
3177 return true;
3178 });
3179
3180 // All elements have neg modifier
3181 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3182 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3183 Mods |= SISrcMods::NEG;
3184 Mods |= SISrcMods::NEG_HI;
3185 }
3186 }
3187
3188 // mods are on v2f16 elements
3189 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3190 SmallVector<SDValue, 8> EltsV2F16;
3191 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3192 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3193 // Based on first element decide which mod we match, neg or abs
3194 if (ElV2f16.getOpcode() != ISD::FNEG)
3195 break;
3196 EltsV2F16.push_back(ElV2f16.getOperand(0));
3197 }
3198
3199 // All pairs of elements have neg modifier
3200 if (BV->getNumOperands() == EltsV2F16.size()) {
3201 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3202 Mods |= SISrcMods::NEG;
3203 Mods |= SISrcMods::NEG_HI;
3204 }
3205 }
3206
3207 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3208 return true;
3209}
3210
3211bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3212 SDValue &SrcMods) const {
3213 Src = In;
3214 unsigned Mods = SISrcMods::OP_SEL_1;
3215 unsigned ModOpcode;
3216
3217 // mods are on f16 elements
3218 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3219 SmallVector<SDValue, 8> EltsF16;
3220 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3221 // Based on first element decide which mod we match, neg or abs
3222 if (EltsF16.empty())
3223 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3224 if (ElF16.getOpcode() != ModOpcode)
3225 return false;
3226 EltsF16.push_back(ElF16.getOperand(0));
3227 return true;
3228 });
3229
3230 // All elements have ModOpcode modifier
3231 if (BV->getNumOperands() * 2 == EltsF16.size())
3232 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3233 16);
3234 }
3235
3236 // mods are on v2f16 elements
3237 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3238 SmallVector<SDValue, 8> EltsV2F16;
3239
3240 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3241 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3242 // Based on first element decide which mod we match, neg or abs
3243 if (EltsV2F16.empty())
3244 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3245 if (ElV2f16->getOpcode() != ModOpcode)
3246 break;
3247 EltsV2F16.push_back(ElV2f16->getOperand(0));
3248 }
3249
3250 // All elements have ModOpcode modifier
3251 if (BV->getNumOperands() == EltsV2F16.size())
3252 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3253 32);
3254 }
3255
3256 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3257 return true;
3258}
3259
3260bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3261 SDValue &SrcMods) const {
3262 Src = In;
3263 unsigned Mods = SISrcMods::OP_SEL_1;
3264 SmallVector<SDValue, 8> EltsF32;
3265
3266 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3267 assert(BV->getNumOperands() > 0);
3268 // Based on first element decide which mod we match, neg or abs
3269 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3270 unsigned ModOpcode =
3271 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3272 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3273 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3274 if (ElF32.getOpcode() != ModOpcode)
3275 break;
3276 EltsF32.push_back(ElF32.getOperand(0));
3277 }
3278
3279 // All elements had ModOpcode modifier
3280 if (BV->getNumOperands() == EltsF32.size())
3281 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3282 32);
3283 }
3284
3285 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3286 return true;
3287}
3288
3289bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3290 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3291 BitVector UndefElements;
3292 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3293 if (isInlineImmediate(Splat.getNode())) {
3294 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3295 unsigned Imm = C->getAPIntValue().getSExtValue();
3296 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3297 return true;
3298 }
3299 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3300 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3301 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3302 return true;
3303 }
3304 llvm_unreachable("unhandled Constant node");
3305 }
3306 }
3307
3308 // 16 bit splat
3309 SDValue SplatSrc32 = stripBitcast(In);
3310 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32)) {
3311 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3312 SDValue SplatSrc16 = stripBitcast(Splat32);
3313 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16)) {
3314 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3315
3316 // f16
3317 if (isInlineImmediate(Splat.getNode())) {
3318 const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat);
3319 int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3320 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16);
3321 return true;
3322 }
3323
3324 // bf16
3325 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3326 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3327 APInt BF16Value = C->getAPIntValue();
3328 APInt F32Value = BF16Value.zext(32).shl(16);
3329 if (TII->isInlineConstant(F32Value)) {
3330 int64_t Imm = F32Value.getSExtValue();
3331 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3332 return true;
3333 }
3334 }
3335 }
3336 }
3337 }
3338 }
3339
3340 return false;
3341}
3342
3343bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3344 SDValue &IndexKey) const {
3345 unsigned Key = 0;
3346 Src = In;
3347
3348 if (In.getOpcode() == ISD::SRL) {
3349 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3350 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3351 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3352 ShiftAmt->getZExtValue() % 8 == 0) {
3353 Key = ShiftAmt->getZExtValue() / 8;
3354 Src = ShiftSrc;
3355 }
3356 }
3357
3358 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3359 return true;
3360}
3361
3362bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3363 SDValue &IndexKey) const {
3364 unsigned Key = 0;
3365 Src = In;
3366
3367 if (In.getOpcode() == ISD::SRL) {
3368 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3369 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3370 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3371 ShiftAmt->getZExtValue() == 16) {
3372 Key = 1;
3373 Src = ShiftSrc;
3374 }
3375 }
3376
3377 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3378 return true;
3379}
3380
3381bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3382 SDValue &SrcMods) const {
3383 Src = In;
3384 // FIXME: Handle op_sel
3385 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3386 return true;
3387}
3388
3389bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3390 SDValue &SrcMods) const {
3391 // FIXME: Handle op_sel
3392 return SelectVOP3Mods(In, Src, SrcMods);
3393}
3394
3395// The return value is not whether the match is possible (which it always is),
3396 // but whether or not a conversion is really used.
3397bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3398 unsigned &Mods) const {
3399 Mods = 0;
3400 SelectVOP3ModsImpl(In, Src, Mods);
3401
3402 if (Src.getOpcode() == ISD::FP_EXTEND) {
3403 Src = Src.getOperand(0);
3404 assert(Src.getValueType() == MVT::f16);
3405 Src = stripBitcast(Src);
3406
3407 // Be careful about folding modifiers if we already have an abs. fneg is
3408 // applied last, so we don't want to apply an earlier fneg.
3409 if ((Mods & SISrcMods::ABS) == 0) {
3410 unsigned ModsTmp;
3411 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3412
3413 if ((ModsTmp & SISrcMods::NEG) != 0)
3414 Mods ^= SISrcMods::NEG;
3415
3416 if ((ModsTmp & SISrcMods::ABS) != 0)
3417 Mods |= SISrcMods::ABS;
3418 }
3419
3420 // op_sel/op_sel_hi decide the source type and source.
3421 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3422 // If the source's op_sel is set, it picks the high half of the source
3423 // register.
3424
3425 Mods |= SISrcMods::OP_SEL_1;
3426 if (isExtractHiElt(Src, Src)) {
3427 Mods |= SISrcMods::OP_SEL_0;
3428
3429 // TODO: Should we try to look for neg/abs here?
3430 }
3431
3432 return true;
3433 }
3434
3435 return false;
3436}
3437
3438bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3439 SDValue &SrcMods) const {
3440 unsigned Mods = 0;
3441 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3442 return false;
3443 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3444 return true;
3445}
3446
3447bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3448 SDValue &SrcMods) const {
3449 unsigned Mods = 0;
3450 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3451 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3452 return true;
3453}
3454
3455SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3456 if (In.isUndef())
3457 return CurDAG->getUNDEF(MVT::i32);
3458
3459 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3460 SDLoc SL(In);
3461 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3462 }
3463
3464 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3465 SDLoc SL(In);
3466 return CurDAG->getConstant(
3467 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3468 }
3469
3470 SDValue Src;
3471 if (isExtractHiElt(In, Src))
3472 return Src;
3473
3474 return SDValue();
3475}
3476
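// Heuristic for deciding whether an immediate should be selected into a VGPR:
// returns true only if, among the first few uses inspected, none requires an
// SGPR and at least one requires a VGPR operand even after trying to commute
// the using instruction.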
3477bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3478 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3479
3480 const SIRegisterInfo *SIRI =
3481 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3482 const SIInstrInfo * SII =
3483 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3484
3485 unsigned Limit = 0;
3486 bool AllUsesAcceptSReg = true;
3487 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3488 Limit < 10 && U != E; ++U, ++Limit) {
3489 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
3490
3491 // If the register class is unknown, it could be one that needs to
3492 // be an SGPR, e.g. an inline asm
3493 // constraint.
3494 if (!RC || SIRI->isSGPRClass(RC))
3495 return false;
3496
3497 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3498 AllUsesAcceptSReg = false;
3499 SDNode * User = *U;
3500 if (User->isMachineOpcode()) {
3501 unsigned Opc = User->getMachineOpcode();
3502 const MCInstrDesc &Desc = SII->get(Opc);
3503 if (Desc.isCommutable()) {
3504 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3505 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3506 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3507 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3508 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3509 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3510 CommutedRC == &AMDGPU::VS_64RegClass)
3511 AllUsesAcceptSReg = true;
3512 }
3513 }
3514 }
3515 // If AllUsesAcceptSReg is still false, we have not succeeded in
3516 // commuting the current user. This means at least one use
3517 // strictly requires a VGPR, so we will not attempt to commute
3518 // other user instructions.
3519 if (!AllUsesAcceptSReg)
3520 break;
3521 }
3522 }
3523 return !AllUsesAcceptSReg && (Limit < 10);
3524}
3525
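// Return true if this load can be selected as a scalar load: the load must be
// uniform, sufficiently aligned, and either from a constant address space or,
// when scalarizing global loads is enabled, a simple global load whose memory
// is not clobbered.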
3526bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3527 auto Ld = cast<LoadSDNode>(N);
3528
3529 const MachineMemOperand *MMO = Ld->getMemOperand();
3530 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3531 return false;
3532
3533 return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) &&
3534 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3535 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3536 (Subtarget->getScalarizeGlobalBehavior() &&
3537 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3538 Ld->isSimple() &&
3539 static_cast<const SITargetLowering *>(getTargetLowering())
3540 ->isMemOpHasNoClobberedMemOperand(N)));
3541}
3542
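// Run PostISelFolding over all selected machine nodes, replacing folded nodes
// and removing dead ones, and repeat until no further changes are made.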
3543void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3544 const AMDGPUTargetLowering& Lowering =
3545 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3546 bool IsModified = false;
3547 do {
3548 IsModified = false;
3549
3550 // Go over all selected nodes and try to fold them a bit more
3551 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3552 while (Position != CurDAG->allnodes_end()) {
3553 SDNode *Node = &*Position++;
3554 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3555 if (!MachineNode)
3556 continue;
3557
3558 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3559 if (ResNode != Node) {
3560 if (ResNode)
3561 ReplaceUses(Node, ResNode);
3562 IsModified = true;
3563 }
3564 }
3565 CurDAG->RemoveDeadNodes();
3566 } while (IsModified);
3567}
3568
3569char AMDGPUDAGToDAGISel::ID = 0;