LLVM 23.0.0git
AMDGPUISelDAGToDAG.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// Defines an instruction selector for the AMDGPU target.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUISelDAGToDAG.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUSubtarget.h"
18#include "AMDGPUTargetMachine.h"
21#include "R600RegisterInfo.h"
22#include "SIISelLowering.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32
33#ifdef EXPENSIVE_CHECKS
35#include "llvm/IR/Dominators.h"
36#endif
37
38#define DEBUG_TYPE "amdgpu-isel"
39
40using namespace llvm;
41
42//===----------------------------------------------------------------------===//
43// Instruction Selector Implementation
44//===----------------------------------------------------------------------===//
45
46namespace {
47static SDValue stripBitcast(SDValue Val) {
48 return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
49}
50
51// Figure out if this is really an extract of the high 16-bits of a dword.
52static bool isExtractHiElt(SDValue In, SDValue &Out) {
53 In = stripBitcast(In);
54
55 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
56 if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
57 if (!Idx->isOne())
58 return false;
59 Out = In.getOperand(0);
60 return true;
61 }
62 }
63
64 if (In.getOpcode() != ISD::TRUNCATE)
65 return false;
66
67 SDValue Srl = In.getOperand(0);
68 if (Srl.getOpcode() == ISD::SRL) {
69 if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
70 if (ShiftAmt->getZExtValue() == 16) {
71 Out = stripBitcast(Srl.getOperand(0));
72 return true;
73 }
74 }
75 }
76
77 return false;
78}
79
// Widen the 16-bit value \p Lo into a 32-bit source operand matching
// \p Src's value type, for use as a VOP3P source. On subtargets without
// real true16 instructions this is a no-op and \p Lo is returned unchanged.
static SDValue createVOP3PSrc32FromLo16(SDValue Lo, SDValue Src,
                                        llvm::SelectionDAG *CurDAG,
                                        const GCNSubtarget *Subtarget) {
  if (!Subtarget->useRealTrue16Insts()) {
    return Lo;
  }

  SDValue NewSrc;
  SDLoc SL(Lo);

  if (Lo->isDivergent()) {
    // Divergent case: build a VGPR_32 REG_SEQUENCE with Lo in the low 16
    // bits and an IMPLICIT_DEF in the (unused) high 16 bits.
    SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   SL, Lo.getValueType()),
                            0);
    const SDValue Ops[] = {
        CurDAG->getTargetConstant(AMDGPU::VGPR_32RegClassID, SL, MVT::i32), Lo,
        CurDAG->getTargetConstant(AMDGPU::lo16, SL, MVT::i16), Undef,
        CurDAG->getTargetConstant(AMDGPU::hi16, SL, MVT::i16)};

    NewSrc = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
                                            Src.getValueType(), Ops),
                     0);
  } else {
    // The S_MOV is needed since the Lo could still be a VGPR16.
    // With S_MOV, isel inserts a "sgpr32 = copy vgpr16" and we rely on
    // the fixvgpr2sgprcopy pass to legalize it.
    NewSrc = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, Src.getValueType(), Lo),
        0);
  }

  return NewSrc;
}
113
114// Look through operations that obscure just looking at the low 16-bits of the
115// same register.
116static SDValue stripExtractLoElt(SDValue In) {
117 if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
118 SDValue Idx = In.getOperand(1);
119 if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
120 return In.getOperand(0);
121 }
122
123 if (In.getOpcode() == ISD::TRUNCATE) {
124 SDValue Src = In.getOperand(0);
125 if (Src.getValueType().getSizeInBits() == 32)
126 return stripBitcast(Src);
127 }
128
129 return In;
130}
131
132} // end anonymous namespace
133
135 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
136 false)
137INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
139#ifdef EXPENSIVE_CHECKS
142#endif
144 "AMDGPU DAG->DAG Pattern Instruction Selection", false,
145 false)
146
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}
153
157
159 Subtarget = &MF.getSubtarget<GCNSubtarget>();
160 Subtarget->checkSubtargetFeatures(MF.getFunction());
161 Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
163}
164
// Returns true if an fp16 result produced by opcode \p Opc is known to have
// its high 16 bits zeroed, so a consumer packing it into a dword does not
// need to re-mask the high half.
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}
227
229#ifdef EXPENSIVE_CHECKS
231 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
232 for (auto &L : LI->getLoopsInPreorder()) {
233 assert(L->isLCSSAForm(DT));
234 }
235#endif
237}
238
247
  // Only valid when unused 16-bit lanes survive a d16 load untouched.
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    // Feed the existing low element in as the tied input; the d16 load only
    // overwrites the high half.
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    // Rewrite users of both the vector result and the original load's chain.
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
327
  // The d16 rewrites below are only valid when unused halves are preserved.
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  // Iterate the node list from the end toward the beginning.
  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}
356
/// Returns true if \p N is undef or a constant (integer or FP) that can be
/// encoded as an inline immediate operand.
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
    // Integer constant case.
    return TII->isInlineConstant(C->getAPIntValue());

    // Floating-point constant case.
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}
370
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        return MRI.getRegClass(Reg);
      }

      // Physical register: query its base register class directly.
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    const MCInstrDesc &Desc = TII->get(N->getMachineOpcode());
    // OpNo counts use operands only; skip past the defs in the descriptor.
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;

    int16_t RegClass = TII->getOpRegClassID(Desc.operands()[OpIdx]);
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    // Operand 0 carries the super register class ID.
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    // Constrain the super class by the subregister index paired with the
    // operand.
    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}
418
// Rebuild \p N in place with \p NewChain substituted for its chain operand
// (operand 0) and \p Glue appended as a trailing glue operand.
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
429
430SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
431 const SITargetLowering& Lowering =
432 *static_cast<const SITargetLowering*>(getTargetLowering());
433
434 assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
435
436 SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
437 return glueCopyToOp(N, M0, M0.getValue(1));
438}
439
440SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
441 unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
442 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
443 if (Subtarget->ldsRequiresM0Init())
444 return glueCopyToM0(
445 N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
446 } else if (AS == AMDGPUAS::REGION_ADDRESS) {
447 MachineFunction &MF = CurDAG->getMachineFunction();
448 unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
449 return
450 glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
451 }
452 return N;
453}
454
455MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
456 EVT VT) const {
457 SDNode *Lo = CurDAG->getMachineNode(
458 AMDGPU::S_MOV_B32, DL, MVT::i32,
459 CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
460 SDNode *Hi = CurDAG->getMachineNode(
461 AMDGPU::S_MOV_B32, DL, MVT::i32,
462 CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
463 const SDValue Ops[] = {
464 CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
465 SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
466 SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
467
468 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
469}
470
471SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N,
472 SelectionDAG &DAG) const {
473 // TODO: Handle undef as zero
474
475 assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
476 uint32_t LHSVal, RHSVal;
477 if (getConstantValue(N->getOperand(0), LHSVal) &&
478 getConstantValue(N->getOperand(1), RHSVal)) {
479 SDLoc SL(N);
480 uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);
481 return DAG.getMachineNode(
482 isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL,
483 N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32));
484 }
485
486 return nullptr;
487}
488
// Lower a BUILD_VECTOR (or SCALAR_TO_VECTOR) of 32-bit elements into a
// REG_SEQUENCE over register class \p RegClassID, with fast paths for
// single-element vectors and fully-constant 64-bit vectors.
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    // Degenerate vector: just constrain the scalar to the register class.
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
  // With 64-bit literals, an all-constant 64-bit vector can be materialized
  // by a single S_MOV_B64 immediate pseudo.
  if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 &&
      CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) {
    uint64_t C = 0;
    bool AllConst = true;
    unsigned EltSize = EltVT.getSizeInBits();
    for (unsigned I = 0; I < NumVectorElts; ++I) {
      SDValue Op = N->getOperand(I);
      if (Op.isUndef()) {
        AllConst = false;
        break;
      }
      uint64_t Val;
        Val = CF->getValueAPF().bitcastToAPInt().getZExtValue();
      } else
        Val = cast<ConstantSDNode>(Op)->getZExtValue();
      C |= Val << (EltSize * I);
    }
    if (AllConst) {
      SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64);
      MachineSDNode *Copy =
          CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV);
      CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0),
                           RegClass);
      return;
    }
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}
570
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) ||
      VT.getVectorNumElements() != 2) {
    SelectCode(N);
    return;
  }

  auto *SVN = cast<ShuffleVectorSDNode>(N);

  SDValue Src0 = SVN->getOperand(0);
  SDValue Src1 = SVN->getOperand(1);
  ArrayRef<int> Mask = SVN->getMask();
  SDLoc DL(N);

  assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 &&
         Mask[0] < 4 && Mask[1] < 4);

  // Pick the source vector and subregister half for each result element:
  // mask indices 0..1 address Src0, 2..3 address Src1; the low mask bit
  // selects the element within the chosen source.
  SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1;
  SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1;
  unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0;

  // A negative mask index means the element is undef; feed an IMPLICIT_DEF.
  if (Mask[0] < 0) {
    Src0SubReg = Src1SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc0 = SDValue(ImpDef, 0);
  }

  if (Mask[1] < 0) {
    Src1SubReg = Src0SubReg;
    MachineSDNode *ImpDef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
    VSrc1 = SDValue(ImpDef, 0);
  }

  // SGPR case needs to lower to copies.
  //
  // Also use subregister extract when we can directly blend the registers with
  // a simple subregister copy.
  //
  // TODO: Maybe we should fold this out earlier
  if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 &&
      Src1SubReg == AMDGPU::sub0) {
    // The low element of the result always comes from src0.
    // The high element of the result always comes from src1.
    // op_sel selects the high half of src0.
    // op_sel_hi selects the high half of src1.

    unsigned Src0OpSel =
        Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
    unsigned Src1OpSel =
        Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;

    // Enable op_sel_hi to avoid printing it. This should have no effect on the
    // result.
    Src0OpSel |= SISrcMods::OP_SEL_1;
    Src1OpSel |= SISrcMods::OP_SEL_1;

    SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32);
    SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32);
    SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32);

    CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(),
                         {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1,
                          ZeroMods, // clamp
                          ZeroMods, // op_sel
                          ZeroMods, // op_sel_hi
                          ZeroMods, // neg_lo
                          ZeroMods}); // neg_hi
    return;
  }

  // Fallback: blend via subregister extracts and a REG_SEQUENCE.
  SDValue ResultElt0 =
      CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0);
  SDValue ResultElt1 =
      CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1);

  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
  CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops);
}
659
  // Main instruction-selection dispatch for one DAG node.
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    // Memory nodes may need an M0 init glued on before selection.
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      // Try packing a v2i16/v2f16 of constants into one 32-bit move.
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    // Divergent vectors are built in VGPRs, uniform ones in SGPRs.
    const TargetRegisterClass *RegClass =
        N->isDivergent()
            ? TRI->getDefaultVectorSuperClassForBitWidth(NumVectorElts * 32)
            : SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32);

    SelectBuildVector(N, RegClass->getID());
    return;
  }
    return;
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    // Only split 64-bit constants that are neither inline immediates nor
    // representable as a single literal on this subtarget.
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) ||
        Subtarget->has64BitLiterals())
      break;

    uint64_t Imm;
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  // Fall back to the generated pattern matcher.
  SelectCode(N);
}
883
884bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
885 const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
886 const Instruction *Term = BB->getTerminator();
887 return Term->getMetadata("amdgpu.uniform") ||
888 Term->getMetadata("structurizecfg.uniform");
889}
890
891bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
892 unsigned ShAmtBits) const {
893 assert(N->getOpcode() == ISD::AND);
894
895 const APInt &RHS = N->getConstantOperandAPInt(1);
896 if (RHS.countr_one() >= ShAmtBits)
897 return true;
898
899 const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
900 return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
901}
902
                                       SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
    // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        // Return the common base vector's source and the low-half offset.
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}
933
// Match \p Addr as a 64-bit (base + constant offset), returning the parts in
// \p LHS / \p RHS. Also handles the pattern left behind by the earlier
// split of a 64-bit `or`.
bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

    return true;
  }

  return false;
}
949
951 return "AMDGPU DAG->DAG Pattern Instruction Selection";
952}
953
957
961#ifdef EXPENSIVE_CHECKS
963 .getManager();
964 auto &F = MF.getFunction();
965 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
966 LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
967 for (auto &L : LI.getLoopsInPreorder())
968 assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
969#endif
970 return SelectionDAGISelPass::run(MF, MFAM);
971}
972
973//===----------------------------------------------------------------------===//
974// Complex Patterns
975//===----------------------------------------------------------------------===//
976
// Complex pattern stub that never matches on this path; kept only to
// satisfy the ComplexPattern interface.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
981
// Match an indirect address as (INDIRECT_BASE_ADDR + constant offset) where
// possible; otherwise fall back to (Addr + 0). Always succeeds.
bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    // Bare constant address.
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
    // DWORDADDR wrapping a constant.
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
    // (base add/or constant).
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    // Default: whole address as base, zero offset.
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}
1005
1006SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1007 const SDLoc &DL) const {
1008 SDNode *Mov = CurDAG->getMachineNode(
1009 AMDGPU::S_MOV_B32, DL, MVT::i32,
1010 CurDAG->getTargetConstant(Val, DL, MVT::i32));
1011 return SDValue(Mov, 0);
1012}
1013
// FIXME: Should only handle uaddo_carry/usubo_carry
// Expand a 64-bit add/sub (with optional carry in/out) into a 32-bit
// low-half op plus a carry-consuming high-half op, recombined with a
// REG_SEQUENCE.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  // Split both 64-bit operands into 32-bit halves.
  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  // Indexed as [carry-op?][divergent?][is-add?].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    // Carry-in (operand 2) feeds the low-half carry op.
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  // The high half always consumes the glue (carry) produced by the low half.
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
1083
1084void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1085 SDValue LHS = N->getOperand(0);
1086 SDValue RHS = N->getOperand(1);
1087 SDValue CI = N->getOperand(2);
1088
1089 if (N->isDivergent()) {
1090 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
1091 : AMDGPU::V_SUBB_U32_e64;
1092 CurDAG->SelectNodeTo(
1093 N, Opc, N->getVTList(),
1094 {LHS, RHS, CI,
1095 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1096 } else {
1097 unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
1098 : AMDGPU::S_SUB_CO_PSEUDO;
1099 CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1100 }
1101}
1102
1103void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1104 // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1105 // carry out despite the _i32 name. These were renamed in VI to _U32.
1106 // FIXME: We should probably rename the opcodes here.
1107 bool IsAdd = N->getOpcode() == ISD::UADDO;
1108 bool IsVALU = N->isDivergent();
1109
1110 for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
1111 ++UI)
1112 if (UI.getUse().getResNo() == 1) {
1113 if (UI->isMachineOpcode()) {
1114 if (UI->getMachineOpcode() !=
1115 (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) {
1116 IsVALU = true;
1117 break;
1118 }
1119 } else {
1120 if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) {
1121 IsVALU = true;
1122 break;
1123 }
1124 }
1125 }
1126
1127 if (IsVALU) {
1128 unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1129
1130 CurDAG->SelectNodeTo(
1131 N, Opc, N->getVTList(),
1132 {N->getOperand(0), N->getOperand(1),
1133 CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1134 } else {
1135 unsigned Opc = IsAdd ? AMDGPU::S_UADDO_PSEUDO : AMDGPU::S_USUBO_PSEUDO;
1136
1137 CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1138 {N->getOperand(0), N->getOperand(1)});
1139 }
1140}
1141
// Select a chained f32 FMA, folding source modifiers into VOP3 operand
// fields. Prefers the VOP2 FMAC encoding when no modifiers are used.
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  // Ops[8]/Ops[9]: incoming chain (operand 0) and glue (operand 4).
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}
1161
// Select a chained f32 multiply to V_MUL_F32_e64, folding source modifiers
// into the VOP3 operand fields.
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  // Ops[6]/Ops[7]: incoming chain (operand 0) and glue (operand 3).
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}
1173
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects AMDGPU DIV_SCALE (f32/f64) to the corresponding VOP3B instruction,
// folding source modifiers on all three sources.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
1192
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects MAD_I64_I32 / MAD_U64_U32. When the subtarget has the no-carry
// variant and the carry result (value 1) is unused, the no-carry opcode is
// emitted as a new machine node with a single i64 result.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1);
  if (Subtarget->hasMADIntraFwdBug())
    // gfx11 variant works around the intra-wave forwarding bug.
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else if (UseNoCarry)
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };

  if (UseNoCarry) {
    // The no-carry opcode has only one result, so we cannot SelectNodeTo the
    // two-result node; build a fresh node and rewire the i64 result.
    MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
    ReplaceUses(SDValue(N, 0), SDValue(Mad, 0));
    CurDAG->RemoveDeadNode(N);
    return;
  }

  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
1221
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects SMUL_LOHI/UMUL_LOHI as a 64-bit MAD with a zero addend, then
// extracts the low and high 32-bit halves for the two results.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDVTList VTList;
  unsigned Opc;
  if (Subtarget->hasMadU64U32NoCarry()) {
    // No-carry variant: single i64 result.
    VTList = CurDAG->getVTList(MVT::i64);
    Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64;
  } else {
    // Carry-producing variant: i64 result plus an (unused) i1 carry.
    VTList = CurDAG->getVTList(MVT::i64, MVT::i1);
    if (Subtarget->hasMADIntraFwdBug()) {
      // gfx11 variant works around the intra-wave forwarding bug.
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                   : AMDGPU::V_MAD_U64_U32_gfx11_e64;
    } else {
      Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
    }
  }

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops);
  if (!SDValue(N, 0).use_empty()) {
    // Low result = sub0 of the 64-bit product.
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    // High result = sub1 of the 64-bit product.
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
1260
1261bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1262 if (!isUInt<16>(Offset))
1263 return false;
1264
1265 if (!Base || Subtarget->hasUsableDSOffset() ||
1266 Subtarget->unsafeDSOffsetFoldingEnabled())
1267 return true;
1268
1269 // On Southern Islands instruction with a negative base value and an offset
1270 // don't seem to work.
1271 return CurDAG->SignBitIsZero(Base);
1272}
1273
1274bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1275 SDValue &Offset) const {
1276 SDLoc DL(Addr);
1277 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1278 SDValue N0 = Addr.getOperand(0);
1279 SDValue N1 = Addr.getOperand(1);
1280 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1281 if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1282 // (add n0, c0)
1283 Base = N0;
1284 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1285 return true;
1286 }
1287 } else if (Addr.getOpcode() == ISD::SUB) {
1288 // sub C, x -> add (sub 0, x), C
1289 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1290 int64_t ByteOffset = C->getSExtValue();
1291 if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1292 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1293
1294 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1295 // the known bits in isDSOffsetLegal. We need to emit the selected node
1296 // here, so this is thrown away.
1297 SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1298 Zero, Addr.getOperand(1));
1299
1300 if (isDSOffsetLegal(Sub, ByteOffset)) {
1302 Opnds.push_back(Zero);
1303 Opnds.push_back(Addr.getOperand(1));
1304
1305 // FIXME: Select to VOP3 version for with-carry.
1306 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1307 if (Subtarget->hasAddNoCarryInsts()) {
1308 SubOp = AMDGPU::V_SUB_U32_e64;
1309 Opnds.push_back(
1310 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1311 }
1312
1313 MachineSDNode *MachineSub =
1314 CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1315
1316 Base = SDValue(MachineSub, 0);
1317 Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1318 return true;
1319 }
1320 }
1321 }
1322 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1323 // If we have a constant address, prefer to put the constant into the
1324 // offset. This can save moves to load the constant address since multiple
1325 // operations can share the zero base address register, and enables merging
1326 // into read2 / write2 instructions.
1327
1328 SDLoc DL(Addr);
1329
1330 if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1331 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1332 MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1333 DL, MVT::i32, Zero);
1334 Base = SDValue(MovZero, 0);
1335 Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1336 return true;
1337 }
1338 }
1339
1340 // default case
1341 Base = Addr;
1342 Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1343 return true;
1344}
1345
1346bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1347 unsigned Offset1,
1348 unsigned Size) const {
1349 if (Offset0 % Size != 0 || Offset1 % Size != 0)
1350 return false;
1351 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1352 return false;
1353
1354 if (!Base || Subtarget->hasUsableDSOffset() ||
1355 Subtarget->unsafeDSOffsetFoldingEnabled())
1356 return true;
1357
1358 // On Southern Islands instruction with a negative base value and an offset
1359 // don't seem to work.
1360 return CurDAG->SignBitIsZero(Base);
1361}
1362
1363// Return whether the operation has NoUnsignedWrap property.
1364static bool isNoUnsignedWrap(SDValue Addr) {
1365 return (Addr.getOpcode() == ISD::ADD &&
1366 Addr->getFlags().hasNoUnsignedWrap()) ||
1367 Addr->getOpcode() == ISD::OR;
1368}
1369
// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  // A computation known not to wrap unsigned keeps the base non-negative.
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  // Otherwise require the base's sign bit to be known zero.
  return CurDAG->SignBitIsZero(LHS);
}
1397
1398// Check address value in SGPR/VGPR are legal for flat scratch in the form
1399// of: SGPR + VGPR.
1400bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
1401 if (isNoUnsignedWrap(Addr))
1402 return true;
1403
1404 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
1405 // values.
1406 if (Subtarget->hasSignedScratchOffsets())
1407 return true;
1408
1409 auto LHS = Addr.getOperand(0);
1410 auto RHS = Addr.getOperand(1);
1411 return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
1412}
1413
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  // Addr is (Base + Imm) where Base is itself (SGPR + VGPR).
  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  // Otherwise both components of the inner sum must be provably non-negative.
  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}
1437
// TODO: If offset is too big, put low 16-bit into offset.
// Two-offset DS addressing for 64-bit read2/write2 (4-byte element stride).
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}
1444
// Two-offset DS addressing for 128-bit read2/write2 (8-byte element stride).
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}
1450
1451bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1452 SDValue &Offset0, SDValue &Offset1,
1453 unsigned Size) const {
1454 SDLoc DL(Addr);
1455
1456 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1457 SDValue N0 = Addr.getOperand(0);
1458 SDValue N1 = Addr.getOperand(1);
1459 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1460 unsigned OffsetValue0 = C1->getZExtValue();
1461 unsigned OffsetValue1 = OffsetValue0 + Size;
1462
1463 // (add n0, c0)
1464 if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1465 Base = N0;
1466 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1467 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1468 return true;
1469 }
1470 } else if (Addr.getOpcode() == ISD::SUB) {
1471 // sub C, x -> add (sub 0, x), C
1472 if (const ConstantSDNode *C =
1474 unsigned OffsetValue0 = C->getZExtValue();
1475 unsigned OffsetValue1 = OffsetValue0 + Size;
1476
1477 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1478 SDLoc DL(Addr);
1479 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1480
1481 // XXX - This is kind of hacky. Create a dummy sub node so we can check
1482 // the known bits in isDSOffsetLegal. We need to emit the selected node
1483 // here, so this is thrown away.
1484 SDValue Sub =
1485 CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1486
1487 if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1489 Opnds.push_back(Zero);
1490 Opnds.push_back(Addr.getOperand(1));
1491 unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1492 if (Subtarget->hasAddNoCarryInsts()) {
1493 SubOp = AMDGPU::V_SUB_U32_e64;
1494 Opnds.push_back(
1495 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1496 }
1497
1498 MachineSDNode *MachineSub = CurDAG->getMachineNode(
1499 SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1500
1501 Base = SDValue(MachineSub, 0);
1502 Offset0 =
1503 CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1504 Offset1 =
1505 CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1506 return true;
1507 }
1508 }
1509 }
1510 } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1511 unsigned OffsetValue0 = CAddr->getZExtValue();
1512 unsigned OffsetValue1 = OffsetValue0 + Size;
1513
1514 if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1515 SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1516 MachineSDNode *MovZero =
1517 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1518 Base = SDValue(MovZero, 0);
1519 Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
1520 Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
1521 return true;
1522 }
1523 }
1524
1525 // default case
1526
1527 Base = Addr;
1528 Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
1529 Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
1530 return true;
1531}
1532
// Decompose an address into the MUBUF operand set (ptr, vaddr, soffset,
// offset, and the offen/idxen/addr64 control bits). Divergent components go
// into vaddr, uniform ones into the resource pointer; an out-of-range
// constant offset is materialized into soffset.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  // Subtargets with restricted SOffset encode "no soffset" as the null
  // scalar register instead of an immediate 0.
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  // Peel off a 32-bit-representable constant offset, if any.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0->isAnyAdd()) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}
1620
1621bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1622 SDValue &VAddr, SDValue &SOffset,
1623 SDValue &Offset) const {
1624 SDValue Ptr, Offen, Idxen, Addr64;
1625
1626 // addr64 bit was removed for volcanic islands.
1627 // FIXME: This should be a pattern predicate and not reach here
1628 if (!Subtarget->hasAddr64())
1629 return false;
1630
1631 if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1632 return false;
1633
1634 ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1635 if (C->getSExtValue()) {
1636 SDLoc DL(Addr);
1637
1638 const SITargetLowering& Lowering =
1639 *static_cast<const SITargetLowering*>(getTargetLowering());
1640
1641 SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1642 return true;
1643 }
1644
1645 return false;
1646}
1647
1648std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1649 SDLoc DL(N);
1650
1651 auto *FI = dyn_cast<FrameIndexSDNode>(N);
1652 SDValue TFI =
1653 FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1654
1655 // We rebase the base address into an absolute stack address and hence
1656 // use constant 0 for soffset. This value must be retained until
1657 // frame elimination and eliminateFrameIndex will choose the appropriate
1658 // frame register if need be.
1659 return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1660}
1661
1662bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1663 SDValue Addr, SDValue &Rsrc,
1664 SDValue &VAddr, SDValue &SOffset,
1665 SDValue &ImmOffset) const {
1666
1667 SDLoc DL(Addr);
1668 MachineFunction &MF = CurDAG->getMachineFunction();
1669 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1670
1671 Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1672
1673 if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1674 int64_t Imm = CAddr->getSExtValue();
1675 const int64_t NullPtr =
1677 // Don't fold null pointer.
1678 if (Imm != NullPtr) {
1679 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
1680 SDValue HighBits =
1681 CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
1682 MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1683 AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1684 VAddr = SDValue(MovHighBits, 0);
1685
1686 SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1687 ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
1688 return true;
1689 }
1690 }
1691
1692 if (CurDAG->isBaseWithConstantOffset(Addr)) {
1693 // (add n0, c1)
1694
1695 SDValue N0 = Addr.getOperand(0);
1696 uint64_t C1 = Addr.getConstantOperandVal(1);
1697
1698 // Offsets in vaddr must be positive if range checking is enabled.
1699 //
1700 // The total computation of vaddr + soffset + offset must not overflow. If
1701 // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1702 // overflowing.
1703 //
1704 // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1705 // always perform a range check. If a negative vaddr base index was used,
1706 // this would fail the range check. The overall address computation would
1707 // compute a valid address, but this doesn't happen due to the range
1708 // check. For out-of-bounds MUBUF loads, a 0 is returned.
1709 //
1710 // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1711 // MUBUF vaddr, but not on older subtargets which can only do this if the
1712 // sign bit is known 0.
1713 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1714 if (TII->isLegalMUBUFImmOffset(C1) &&
1715 (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1716 CurDAG->SignBitIsZero(N0))) {
1717 std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1718 ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
1719 return true;
1720 }
1721 }
1722
1723 // (node)
1724 std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1725 ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1726 return true;
1727}
1728
1729static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1730 if (Val.getOpcode() != ISD::CopyFromReg)
1731 return false;
1732 auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
1733 if (!Reg.isPhysical())
1734 return false;
1735 const auto *RC = TRI.getPhysRegBaseClass(Reg);
1736 return RC && TRI.isSGPRClass(RC);
1737}
1738
// Match the offset-only MUBUF scratch form: either a copy from a physical
// SGPR (used as soffset), an (SGPR + legal immediate) sum, or a bare legal
// immediate. Fails for anything else.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  // The constant part goes into the immediate offset field.
  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}
1781
// Match the plain-offset MUBUF form (no offen/idxen/addr64): builds a full
// resource descriptor around the pointer chosen by SelectMUBUF.
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  // Only acceptable if none of the addressing-mode bits were set.
  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}
1806
1807bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
1808 SDValue &SOffset) const {
1809 if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
1810 SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
1811 return true;
1812 }
1813
1814 SOffset = ByteOffsetNode;
1815 return true;
1816}
1817
1818// Find a load or store from corresponding pattern root.
1819// Roots may be build_vector, bitconvert or their combinations.
1822 if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1823 return MN;
1825 for (SDValue V : N->op_values())
1826 if (MemSDNode *MN =
1828 return MN;
1829 llvm_unreachable("cannot find MemSDNode in the pattern!");
1830}
1831
1832bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1833 SDValue &VAddr, SDValue &Offset,
1834 uint64_t FlatVariant) const {
1835 int64_t OffsetVal = 0;
1836
1837 unsigned AS = findMemSDNode(N)->getAddressSpace();
1838
1839 bool CanHaveFlatSegmentOffsetBug =
1840 Subtarget->hasFlatSegmentOffsetBug() &&
1841 FlatVariant == SIInstrFlags::FLAT &&
1843
1844 if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1845 SDValue N0, N1;
1846 if (isBaseWithConstantOffset64(Addr, N0, N1) &&
1847 (FlatVariant != SIInstrFlags::FlatScratch ||
1848 isFlatScratchBaseLegal(Addr))) {
1849 int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1850
1851 // Adding the offset to the base address in a FLAT instruction must not
1852 // change the memory aperture in which the address falls. Therefore we can
1853 // only fold offsets from inbounds GEPs into FLAT instructions.
1854 bool IsInBounds =
1855 Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds();
1856 if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
1857 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1858 if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1859 Addr = N0;
1860 OffsetVal = COffsetVal;
1861 } else {
1862 // If the offset doesn't fit, put the low bits into the offset field
1863 // and add the rest.
1864 //
1865 // For a FLAT instruction the hardware decides whether to access
1866 // global/scratch/shared memory based on the high bits of vaddr,
1867 // ignoring the offset field, so we have to ensure that when we add
1868 // remainder to vaddr it still points into the same underlying object.
1869 // The easiest way to do that is to make sure that we split the offset
1870 // into two pieces that are both >= 0 or both <= 0.
1871
1872 SDLoc DL(N);
1873 uint64_t RemainderOffset;
1874
1875 std::tie(OffsetVal, RemainderOffset) =
1876 TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1877
1878 SDValue AddOffsetLo =
1879 getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1880 SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1881
1882 if (Addr.getValueType().getSizeInBits() == 32) {
1884 Opnds.push_back(N0);
1885 Opnds.push_back(AddOffsetLo);
1886 unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1887 if (Subtarget->hasAddNoCarryInsts()) {
1888 AddOp = AMDGPU::V_ADD_U32_e64;
1889 Opnds.push_back(Clamp);
1890 }
1891 Addr =
1892 SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1893 } else {
1894 // TODO: Should this try to use a scalar add pseudo if the base
1895 // address is uniform and saddr is usable?
1896 SDValue Sub0 =
1897 CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1898 SDValue Sub1 =
1899 CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1900
1901 SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1902 DL, MVT::i32, N0, Sub0);
1903 SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1904 DL, MVT::i32, N0, Sub1);
1905
1906 SDValue AddOffsetHi =
1907 getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1908
1909 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1910
1911 SDNode *Add =
1912 CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1913 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1914
1915 SDNode *Addc = CurDAG->getMachineNode(
1916 AMDGPU::V_ADDC_U32_e64, DL, VTs,
1917 {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1918
1919 SDValue RegSequenceArgs[] = {
1920 CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
1921 MVT::i32),
1922 SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1923
1924 Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1925 MVT::i64, RegSequenceArgs),
1926 0);
1927 }
1928 }
1929 }
1930 }
1931 }
1932
1933 VAddr = Addr;
1934 Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
1935 return true;
1936}
1937
// Select vaddr+offset for a plain FLAT access.
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}
1943
// Select vaddr+offset for a global (FLAT global) access.
bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}
1949
1950bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1951 SDValue &VAddr,
1952 SDValue &Offset) const {
1953 return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1955}
1956
1957// If this matches *_extend i32:x, return x
1958// Otherwise if the value is I32 returns x.
1960 const SelectionDAG *DAG) {
1961 if (Op.getValueType() == MVT::i32)
1962 return Op;
1963
1964 if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
1965 Op.getOpcode() != ISD::ANY_EXTEND &&
1966 !(DAG->SignBitIsZero(Op) &&
1967 Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
1968 return SDValue();
1969
1970 SDValue ExtSrc = Op.getOperand(0);
1971 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1972}
1973
1974// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1975// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
1976bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
1977 SDValue &SAddr, SDValue &VOffset,
1978 SDValue &Offset, bool &ScaleOffset,
1979 bool NeedIOffset) const {
1980 int64_t ImmOffset = 0;
1981 ScaleOffset = false;
1982
1983 // Match the immediate offset first, which canonically is moved as low as
1984 // possible.
1985
1986 SDValue LHS, RHS;
1987 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1988 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1989 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1990
1991 if (NeedIOffset &&
1992 TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1994 Addr = LHS;
1995 ImmOffset = COffsetVal;
1996 } else if (!LHS->isDivergent()) {
1997 if (COffsetVal > 0) {
1998 SDLoc SL(N);
1999 // saddr + large_offset -> saddr +
2000 // (voffset = large_offset & ~MaxOffset) +
2001 // (large_offset & MaxOffset);
2002 int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
2003 if (NeedIOffset) {
2004 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2006 }
2007
2008 if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
2009 : isUInt<32>(RemainderOffset)) {
2010 SDNode *VMov = CurDAG->getMachineNode(
2011 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2012 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2013 VOffset = SDValue(VMov, 0);
2014 SAddr = LHS;
2015 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2016 return true;
2017 }
2018 }
2019
2020 // We are adding a 64 bit SGPR and a constant. If constant bus limit
2021 // is 1 we would need to perform 1 or 2 extra moves for each half of
2022 // the constant and it is better to do a scalar add and then issue a
2023 // single VALU instruction to materialize zero. Otherwise it is less
2024 // instructions to perform VALU adds with immediates or inline literals.
2025 unsigned NumLiterals =
2026 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
2027 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
2028 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
2029 return false;
2030 }
2031 }
2032
2033 // Match the variable offset.
2034 if (Addr->isAnyAdd()) {
2035 LHS = Addr.getOperand(0);
2036
2037 if (!LHS->isDivergent()) {
2038 // add (i64 sgpr), (*_extend (i32 vgpr))
2039 RHS = Addr.getOperand(1);
2040 ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
2041 if (SDValue ExtRHS = matchExtFromI32orI32(
2042 RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2043 SAddr = LHS;
2044 VOffset = ExtRHS;
2045 }
2046 }
2047
2048 RHS = Addr.getOperand(1);
2049 if (!SAddr && !RHS->isDivergent()) {
2050 // add (*_extend (i32 vgpr)), (i64 sgpr)
2051 ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
2052 if (SDValue ExtLHS = matchExtFromI32orI32(
2053 LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
2054 SAddr = RHS;
2055 VOffset = ExtLHS;
2056 }
2057 }
2058
2059 if (SAddr) {
2060 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2061 return true;
2062 }
2063 }
2064
2065 if (Subtarget->hasScaleOffset() &&
2066 (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
2069 (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
2070 CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
2071 Addr.getOperand(0)->isDivergent() &&
2073 !Addr.getOperand(2)->isDivergent()) {
2074 // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
2075 unsigned Size =
2076 (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
2077 ScaleOffset = Addr.getConstantOperandVal(1) == Size;
2078 if (ScaleOffset) {
2079 SAddr = Addr.getOperand(2);
2080 VOffset = Addr.getOperand(0);
2081 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2082 return true;
2083 }
2084 }
2085
2086 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
2087 isa<ConstantSDNode>(Addr))
2088 return false;
2089
2090 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
2091 // moves required to copy a 64-bit SGPR to VGPR.
2092 SAddr = Addr;
2093 SDNode *VMov =
2094 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
2095 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
2096 VOffset = SDValue(VMov, 0);
2097 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2098 return true;
2099}
2100
2101bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
2102 SDValue &SAddr, SDValue &VOffset,
2103 SDValue &Offset,
2104 SDValue &CPol) const {
2105 bool ScaleOffset;
2106 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2107 return false;
2108
2109 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2110 SDLoc(), MVT::i32);
2111 return true;
2112}
2113
2114bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
2115 SDValue &SAddr, SDValue &VOffset,
2116 SDValue &Offset,
2117 SDValue &CPol) const {
2118 bool ScaleOffset;
2119 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2120 return false;
2121
2122 // We are assuming CPol is always the last operand of the intrinsic.
2123 auto PassedCPol =
2124 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2125 CPol = CurDAG->getTargetConstant(
2126 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2127 return true;
2128}
2129
2130bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
2131 SDValue &SAddr,
2132 SDValue &VOffset,
2133 SDValue &Offset,
2134 SDValue &CPol) const {
2135 bool ScaleOffset;
2136 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2137 return false;
2138
2139 // We are assuming CPol is second from last operand of the intrinsic.
2140 auto PassedCPol =
2141 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2142 CPol = CurDAG->getTargetConstant(
2143 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2144 return true;
2145}
2146
2147bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
2148 SDValue &SAddr, SDValue &VOffset,
2149 SDValue &Offset,
2150 SDValue &CPol) const {
2151 bool ScaleOffset;
2152 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
2153 return false;
2154
2155 unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
2156 CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
2157 return true;
2158}
2159
2160bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffset(SDNode *N, SDValue Addr,
2161 SDValue &SAddr,
2162 SDValue &VOffset,
2163 SDValue &CPol) const {
2164 bool ScaleOffset;
2165 SDValue DummyOffset;
2166 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2167 false))
2168 return false;
2169
2170 // We are assuming CPol is always the last operand of the intrinsic.
2171 auto PassedCPol =
2172 N->getConstantOperandVal(N->getNumOperands() - 1) & ~AMDGPU::CPol::SCAL;
2173 CPol = CurDAG->getTargetConstant(
2174 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2175 return true;
2176}
2177
2178bool AMDGPUDAGToDAGISel::SelectGlobalSAddrNoIOffsetM0(SDNode *N, SDValue Addr,
2179 SDValue &SAddr,
2180 SDValue &VOffset,
2181 SDValue &CPol) const {
2182 bool ScaleOffset;
2183 SDValue DummyOffset;
2184 if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, DummyOffset, ScaleOffset,
2185 false))
2186 return false;
2187
2188 // We are assuming CPol is second from last operand of the intrinsic.
2189 auto PassedCPol =
2190 N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
2191 CPol = CurDAG->getTargetConstant(
2192 (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
2193 return true;
2194}
2195
2197 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
2198 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
2199 } else if (SAddr.getOpcode() == ISD::ADD &&
2201 // Materialize this into a scalar move for scalar address to avoid
2202 // readfirstlane.
2203 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
2204 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
2205 FI->getValueType(0));
2206 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
2207 MVT::i32, TFI, SAddr.getOperand(1)),
2208 0);
2209 }
2210
2211 return SAddr;
2212}
2213
2214// Match (32-bit SGPR base) + sext(imm offset)
2215bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
2216 SDValue &SAddr,
2217 SDValue &Offset) const {
2218 if (Addr->isDivergent())
2219 return false;
2220
2221 SDLoc DL(Addr);
2222
2223 int64_t COffsetVal = 0;
2224
2225 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
2226 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
2227 SAddr = Addr.getOperand(0);
2228 } else {
2229 SAddr = Addr;
2230 }
2231
2232 SAddr = SelectSAddrFI(CurDAG, SAddr);
2233
2234 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2235
2236 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2238 int64_t SplitImmOffset, RemainderOffset;
2239 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2241
2242 COffsetVal = SplitImmOffset;
2243
2244 SDValue AddOffset =
2246 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
2247 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
2248 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
2249 SAddr, AddOffset),
2250 0);
2251 }
2252
2253 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
2254
2255 return true;
2256}
2257
2258// Check whether the flat scratch SVS swizzle bug affects this access.
2259bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
2260 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
2261 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
2262 return false;
2263
2264 // The bug affects the swizzling of SVS accesses if there is any carry out
2265 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
2266 // voffset to (soffset + inst_offset).
2267 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
2268 KnownBits SKnown =
2269 KnownBits::add(CurDAG->computeKnownBits(SAddr),
2270 KnownBits::makeConstant(APInt(32, ImmOffset,
2271 /*isSigned=*/true)));
2272 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
2273 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
2274 return (VMax & 3) + (SMax & 3) >= 4;
2275}
2276
2277bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
2278 SDValue &VAddr, SDValue &SAddr,
2279 SDValue &Offset,
2280 SDValue &CPol) const {
2281 int64_t ImmOffset = 0;
2282
2283 SDValue LHS, RHS;
2284 SDValue OrigAddr = Addr;
2285 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
2286 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
2287 const SIInstrInfo *TII = Subtarget->getInstrInfo();
2288
2289 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
2291 Addr = LHS;
2292 ImmOffset = COffsetVal;
2293 } else if (!LHS->isDivergent() && COffsetVal > 0) {
2294 SDLoc SL(N);
2295 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
2296 // (large_offset & MaxOffset);
2297 int64_t SplitImmOffset, RemainderOffset;
2298 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
2300
2301 if (isUInt<32>(RemainderOffset)) {
2302 SDNode *VMov = CurDAG->getMachineNode(
2303 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
2304 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
2305 VAddr = SDValue(VMov, 0);
2306 SAddr = LHS;
2307 if (!isFlatScratchBaseLegal(Addr))
2308 return false;
2309 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
2310 return false;
2311 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
2312 CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2313 return true;
2314 }
2315 }
2316 }
2317
2318 if (Addr.getOpcode() != ISD::ADD)
2319 return false;
2320
2321 LHS = Addr.getOperand(0);
2322 RHS = Addr.getOperand(1);
2323
2324 if (!LHS->isDivergent() && RHS->isDivergent()) {
2325 SAddr = LHS;
2326 VAddr = RHS;
2327 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
2328 SAddr = RHS;
2329 VAddr = LHS;
2330 } else {
2331 return false;
2332 }
2333
2334 if (OrigAddr != Addr) {
2335 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
2336 return false;
2337 } else {
2338 if (!isFlatScratchBaseLegalSV(OrigAddr))
2339 return false;
2340 }
2341
2342 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2343 return false;
2344 SAddr = SelectSAddrFI(CurDAG, SAddr);
2345 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2346
2347 bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
2348 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2349 SDLoc(), MVT::i32);
2350 return true;
2351}
2352
2353// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2354// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2355// Handle the case where the Immediate Offset + SOffset is negative.
2356bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2357 bool Imm32Only,
2358 bool IsBuffer,
2359 int64_t ImmOffset) const {
2360 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2361 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2362 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2363 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2364 return false;
2365 }
2366
2367 return true;
2368}
2369
// Given \p Offset and load node \p N check if an \p Offset is a multiple of
// the load byte size. If it is update \p Offset to a pre-scaled value and
// return true.
bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
                                           bool IsSigned) const {
  bool ScaleOffset = false;
  if (!Subtarget->hasScaleOffset() || !Offset)
    return false;

  // The scale factor is the memory access size in bytes.
  unsigned Size =
      (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;

  // Look through a matching extension so a shift underneath it can still be
  // recognized.
  SDValue Off = Offset;
  if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
    Off = Ext;

  // shl x, log2(Size) is x * Size; checked on the value under the extension.
  if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
    if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Log2_32(Size);
  } else if (Offset.getOpcode() == ISD::MUL ||
             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
             (Offset.isMachineOpcode() &&
              Offset.getMachineOpcode() ==
                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
    // NOTE(review): the multiply forms are matched on the original Offset
    // (not the look-through value Off) — presumably intentional since the
    // 64-bit mul pseudos are not wrapped in an extension; confirm upstream.
    if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
      ScaleOffset = C->getZExtValue() == Size;
  }

  // On success, replace Offset with the pre-scaled operand.
  if (ScaleOffset)
    Offset = Off.getOperand(0);

  return ScaleOffset;
}
2405
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                          SDValue *SOffset, SDValue *Offset,
                                          bool Imm32Only, bool IsBuffer,
                                          bool HasSOffset, int64_t ImmOffset,
                                          bool *ScaleOffset) const {
  assert((!SOffset || !Offset) &&
         "Cannot match both soffset and offset at the same time!");

  // Offset scaling is only attempted for the SGPR-offset form.
  if (ScaleOffset) {
    assert(N && SOffset);

    *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
  }

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    // Non-constant offsets can only be matched into SOffset, either as a
    // 32-bit scalar value directly or through a zero-extension of one.
    if (!SOffset)
      return false;

    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      *SOffset = ByteOffsetNode;
      return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                         ImmOffset);
    }
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        *SOffset = ByteOffsetNode.getOperand(0);
        return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
                                           ImmOffset);
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);

  // GFX9 and GFX10 have signed byte immediate offsets. The immediate
  // offset for S_BUFFER instructions is unsigned.
  int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
  std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
      *Subtarget, ByteOffset, IsBuffer, HasSOffset);
  if (EncodedOffset && Offset && !Imm32Only) {
    *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  // CI-only 32-bit literal immediate form.
  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset && Offset && Imm32Only) {
    *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    return true;
  }

  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  // Fall back: materialize the constant into an SGPR and use the SOffset
  // form.
  if (SOffset) {
    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
    *SOffset = SDValue(
        CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
    return true;
  }

  return false;
}
2478
2479SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2480 if (Addr.getValueType() != MVT::i32)
2481 return Addr;
2482
2483 // Zero-extend a 32-bit address.
2484 SDLoc SL(Addr);
2485
2486 const MachineFunction &MF = CurDAG->getMachineFunction();
2487 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2488 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2489 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2490
2491 const SDValue Ops[] = {
2492 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2493 Addr,
2494 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2495 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2496 0),
2497 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2498 };
2499
2500 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2501 Ops), 0);
2502}
2503
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
                                              SDValue &SBase, SDValue *SOffset,
                                              SDValue *Offset, bool Imm32Only,
                                              bool IsBuffer, bool HasSOffset,
                                              int64_t ImmOffset,
                                              bool *ScaleOffset) const {
  // Immediate+SGPR form: peel the immediate in a first recursive pass, then
  // match the SGPR offset on the remaining base in a second pass.
  if (SOffset && Offset) {
    assert(!Imm32Only && !IsBuffer);
    SDValue B;

    if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
      return false;

    int64_t ImmOff = 0;
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
      ImmOff = C->getSExtValue();

    return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
                                true, ImmOff, ScaleOffset);
  }

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
      !Addr->getFlags().hasNoUnsignedWrap())
    return false;

  SDValue N0, N1;
  // Extract the base and offset if possible.
  if (Addr->isAnyAdd() || CurDAG->isADDLike(Addr)) {
    N0 = Addr.getOperand(0);
    N1 = Addr.getOperand(1);
  } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
    assert(N0 && N1 && isa<ConstantSDNode>(N1));
  }
  if (!N0 || !N1)
    return false;

  // Try either operand as the offset; the other becomes the base.
  if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N0;
    return true;
  }
  if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
                       ImmOffset, ScaleOffset)) {
    SBase = N1;
    return true;
  }
  return false;
}
2557
2558bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
2559 SDValue *SOffset, SDValue *Offset,
2560 bool Imm32Only, bool *ScaleOffset) const {
2561 if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
2562 /* IsBuffer */ false, /* HasSOffset */ false,
2563 /* ImmOffset */ 0, ScaleOffset)) {
2564 SBase = Expand32BitAddress(SBase);
2565 return true;
2566 }
2567
2568 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2569 SBase = Expand32BitAddress(Addr);
2570 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2571 return true;
2572 }
2573
2574 return false;
2575}
2576
// Match an SMRD access with an immediate offset only (no node context, so no
// offset scaling is attempted).
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                       SDValue &Offset) const {
  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
                    &Offset);
}
2582
// Match an SMRD access with the CI-only 32-bit literal immediate offset form.
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                         SDValue &Offset) const {
  // This addressing mode only exists on Sea Islands.
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
                    &Offset, /* Imm32Only */ true);
}
2589
2590bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
2591 SDValue &SOffset, SDValue &CPol) const {
2592 bool ScaleOffset;
2593 if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
2594 /* Imm32Only */ false, &ScaleOffset))
2595 return false;
2596
2597 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2598 SDLoc(N), MVT::i32);
2599 return true;
2600}
2601
2602bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
2603 SDValue &SBase, SDValue &SOffset,
2604 SDValue &Offset,
2605 SDValue &CPol) const {
2606 bool ScaleOffset;
2607 if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
2608 return false;
2609
2610 CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
2611 SDLoc(N), MVT::i32);
2612 return true;
2613}
2614
// Match an S_BUFFER load offset as an immediate (unsigned, per the buffer
// encoding).
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ false, /* IsBuffer */ true);
}
2619
// Match an S_BUFFER load offset as the CI-only 32-bit literal immediate form.
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                               SDValue &Offset) const {
  // This addressing mode only exists on Sea Islands.
  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                          /* Imm32Only */ true, /* IsBuffer */ true);
}
2626
2627bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2628 SDValue &Offset) const {
2629 // Match the (soffset + offset) pair as a 32-bit register base and
2630 // an immediate offset.
2631 return N.getValueType() == MVT::i32 &&
2632 SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
2633 /* SOffset*/ nullptr, &Offset,
2634 /* Imm32Only */ false, /* IsBuffer */ true);
2635}
2636
2637bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2638 SDValue &Base,
2639 SDValue &Offset) const {
2640 SDLoc DL(Index);
2641
2642 if (CurDAG->isBaseWithConstantOffset(Index)) {
2643 SDValue N0 = Index.getOperand(0);
2644 SDValue N1 = Index.getOperand(1);
2645 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2646
2647 // (add n0, c0)
2648 // Don't peel off the offset (c0) if doing so could possibly lead
2649 // the base (n0) to be negative.
2650 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2651 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2652 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2653 Base = N0;
2654 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2655 return true;
2656 }
2657 }
2658
2659 if (isa<ConstantSDNode>(Index))
2660 return false;
2661
2662 Base = Index;
2663 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2664 return true;
2665}
2666
2667SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2668 SDValue Val, uint32_t Offset,
2669 uint32_t Width) {
2670 if (Val->isDivergent()) {
2671 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2672 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2673 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2674
2675 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2676 }
2677 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2678 // Transformation function, pack the offset and width of a BFE into
2679 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2680 // source, bits [5:0] contain the offset and bits [22:16] the width.
2681 uint32_t PackedVal = Offset | (Width << 16);
2682 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2683
2684 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2685}
2686
2687void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2688 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2689 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2690 // Predicate: 0 < b <= c < 32
2691
2692 const SDValue &Shl = N->getOperand(0);
2693 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2694 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2695
2696 if (B && C) {
2697 uint32_t BVal = B->getZExtValue();
2698 uint32_t CVal = C->getZExtValue();
2699
2700 if (0 < BVal && BVal <= CVal && CVal < 32) {
2701 bool Signed = N->getOpcode() == ISD::SRA;
2702 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2703 32 - CVal));
2704 return;
2705 }
2706 }
2707 SelectCode(N);
2708}
2709
2710void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2711 switch (N->getOpcode()) {
2712 case ISD::AND:
2713 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2714 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2715 // Predicate: isMask(mask)
2716 const SDValue &Srl = N->getOperand(0);
2717 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2718 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2719
2720 if (Shift && Mask) {
2721 uint32_t ShiftVal = Shift->getZExtValue();
2722 uint32_t MaskVal = Mask->getZExtValue();
2723
2724 if (isMask_32(MaskVal)) {
2725 uint32_t WidthVal = llvm::popcount(MaskVal);
2726 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2727 WidthVal));
2728 return;
2729 }
2730 }
2731 }
2732 break;
2733 case ISD::SRL:
2734 if (N->getOperand(0).getOpcode() == ISD::AND) {
2735 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2736 // Predicate: isMask(mask >> b)
2737 const SDValue &And = N->getOperand(0);
2738 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2739 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2740
2741 if (Shift && Mask) {
2742 uint32_t ShiftVal = Shift->getZExtValue();
2743 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2744
2745 if (isMask_32(MaskVal)) {
2746 uint32_t WidthVal = llvm::popcount(MaskVal);
2747 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2748 WidthVal));
2749 return;
2750 }
2751 }
2752 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2753 SelectS_BFEFromShifts(N);
2754 return;
2755 }
2756 break;
2757 case ISD::SRA:
2758 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2759 SelectS_BFEFromShifts(N);
2760 return;
2761 }
2762 break;
2763
2765 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2766 SDValue Src = N->getOperand(0);
2767 if (Src.getOpcode() != ISD::SRL)
2768 break;
2769
2770 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2771 if (!Amt)
2772 break;
2773
2774 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2775 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2776 Amt->getZExtValue(), Width));
2777 return;
2778 }
2779 }
2780
2781 SelectCode(N);
2782}
2783
2784bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2785 assert(N->getOpcode() == ISD::BRCOND);
2786 if (!N->hasOneUse())
2787 return false;
2788
2789 SDValue Cond = N->getOperand(1);
2790 if (Cond.getOpcode() == ISD::CopyToReg)
2791 Cond = Cond.getOperand(2);
2792
2793 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2794 return false;
2795
2796 MVT VT = Cond.getOperand(0).getSimpleValueType();
2797 if (VT == MVT::i32)
2798 return true;
2799
2800 if (VT == MVT::i64) {
2801 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2802 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2803 Subtarget->hasScalarCompareEq64();
2804 }
2805
2806 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2807 return true;
2808
2809 return false;
2810}
2811
2812static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2813 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2814 // Special case for amdgcn.ballot:
2815 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2816 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2817 // =>
2818 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2819 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2820 // Cond becomes a i(WaveSize) full mask value.
2821 // Note that ballot doesn't use SETEQ condition but its easy to support it
2822 // here for completeness, so in this case Negate is set true on return.
2823 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2824 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2825 isNullConstant(VCMP.getOperand(1))) {
2826
2827 auto Cond = VCMP.getOperand(0);
2828 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2829 Cond = Cond.getOperand(0);
2830
2831 if (isBoolSGPR(Cond)) {
2832 Negate = VCMP_CC == ISD::SETEQ;
2833 return Cond;
2834 }
2835 }
2836 return SDValue();
2837}
2838
// Select an ISD::BRCOND into the appropriate AMDGPU conditional branch.
// Uniform branches test SCC with S_CBRANCH_SCC0/1; divergent ones copy the
// condition into VCC and use S_CBRANCH_VCCZ/VCCNZ. A ballot-style
// compare-with-zero wrapper around AMDGPUISD::SETCC is peeled off so the
// branch tests the underlying lane-mask value directly.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  if (Cond.isUndef()) {
    // Undefined condition: emit the SI_BR_UNDEF pseudo and let later passes
    // pick either outcome.
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();

  // AndExec: whether the condition must be masked with EXEC before branching
  // on VCC (bits of disabled lanes are otherwise unknown).
  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  bool AndExec = !UseSCCBr;
  bool Negate = false;

  if (Cond.getOpcode() == ISD::SETCC &&
      Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
    SDValue VCMP = Cond->getOperand(0);
    auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        isNullConstant(Cond->getOperand(1)) &&
        // We may encounter ballot.i64 in wave32 mode on -O0.
        VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
      // BRCOND i1 %C, %BB
      // =>
      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
      // VCC = COPY i(WaveSize) %VCMP
      // S_CBRANCH_VCCNZ/VCCZ %BB
      Negate = CC == ISD::SETEQ;
      bool NegatedBallot = false;
      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
        Cond = BallotCond;
        UseSCCBr = !BallotCond->isDivergent();
        Negate = Negate ^ NegatedBallot;
      } else {
        // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
        // selected as V_CMP, but this may change for uniform condition.
        Cond = VCMP;
        UseSCCBr = false;
      }
    }
    // Cond is either V_CMP resulted from AMDGPUISD::SETCC or a combination of
    // V_CMPs resulted from ballot or ballot has uniform condition and SCC is
    // used.
    AndExec = false;
  }

  unsigned BrOp =
      UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
               : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (AndExec) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0. Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(
        CurDAG->getMachineNode(
            Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
            MVT::i1,
            CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
                                                      : AMDGPU::EXEC,
                                MVT::i1),
            Cond),
        0);
  }

  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
2924
2925void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2926 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2927 !N->isDivergent()) {
2928 SDValue Src = N->getOperand(0);
2929 if (Src.getValueType() == MVT::f16) {
2930 if (isExtractHiElt(Src, Src)) {
2931 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2932 {Src});
2933 return;
2934 }
2935 }
2936 }
2937
2938 SelectCode(N);
2939}
2940
2941void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2942 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2943 // be copied to an SGPR with readfirstlane.
2944 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2945 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2946
2947 SDValue Chain = N->getOperand(0);
2948 SDValue Ptr = N->getOperand(2);
2949 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2950 MachineMemOperand *MMO = M->getMemOperand();
2951 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2952
2954 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2955 SDValue PtrBase = Ptr.getOperand(0);
2956 SDValue PtrOffset = Ptr.getOperand(1);
2957
2958 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2959 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2960 N = glueCopyToM0(N, PtrBase);
2961 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2962 }
2963 }
2964
2965 if (!Offset) {
2966 N = glueCopyToM0(N, Ptr);
2967 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2968 }
2969
2970 SDValue Ops[] = {
2971 Offset,
2972 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2973 Chain,
2974 N->getOperand(N->getNumOperands() - 1) // New glue
2975 };
2976
2977 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2978 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2979}
2980
2981// We need to handle this here because tablegen doesn't support matching
2982// instructions with multiple outputs.
2983void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
2984 unsigned Opc;
2985 switch (IntrID) {
2986 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2987 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2988 Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2989 break;
2990 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2991 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2992 break;
2993 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2994 Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2995 break;
2996 }
2997 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2998 N->getOperand(5), N->getOperand(0)};
2999
3000 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3001 MachineMemOperand *MMO = M->getMemOperand();
3002 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3003 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3004}
3005
3006void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
3007 bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
3008 unsigned Opc =
3009 IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
3010
3011 SmallVector<SDValue, 7> TensorOps;
3012 // First two groups
3013 TensorOps.push_back(N->getOperand(2)); // D# group 0
3014 TensorOps.push_back(N->getOperand(3)); // D# group 1
3015
3016 // Use _D2 version if both group 2 and 3 are zero-initialized.
3017 SDValue Group2 = N->getOperand(4);
3018 SDValue Group3 = N->getOperand(5);
3019 if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
3021 Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
3022 : AMDGPU::TENSOR_STORE_FROM_LDS_D2;
3023 } else { // Has at least 4 groups
3024 TensorOps.push_back(Group2); // D# group 2
3025 TensorOps.push_back(Group3); // D# group 3
3026 }
3027
3028 // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
3029 // for now because all existing targets only support up to 4 groups.
3030 TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
3031 TensorOps.push_back(N->getOperand(7)); // cache policy
3032 TensorOps.push_back(N->getOperand(0)); // chain
3033
3034 (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
3035}
3036
3037static unsigned gwsIntrinToOpcode(unsigned IntrID) {
3038 switch (IntrID) {
3039 case Intrinsic::amdgcn_ds_gws_init:
3040 return AMDGPU::DS_GWS_INIT;
3041 case Intrinsic::amdgcn_ds_gws_barrier:
3042 return AMDGPU::DS_GWS_BARRIER;
3043 case Intrinsic::amdgcn_ds_gws_sema_v:
3044 return AMDGPU::DS_GWS_SEMA_V;
3045 case Intrinsic::amdgcn_ds_gws_sema_br:
3046 return AMDGPU::DS_GWS_SEMA_BR;
3047 case Intrinsic::amdgcn_ds_gws_sema_p:
3048 return AMDGPU::DS_GWS_SEMA_P;
3049 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3050 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
3051 default:
3052 llvm_unreachable("not a gws intrinsic");
3053 }
3054}
3055
3056void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
3057 if (!Subtarget->hasGWS() ||
3058 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
3059 !Subtarget->hasGWSSemaReleaseAll())) {
3060 // Let this error.
3061 SelectCode(N);
3062 return;
3063 }
3064
3065 // Chain, intrinsic ID, vsrc, offset
3066 const bool HasVSrc = N->getNumOperands() == 4;
3067 assert(HasVSrc || N->getNumOperands() == 3);
3068
3069 SDLoc SL(N);
3070 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
3071 int ImmOffset = 0;
3072 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
3073 MachineMemOperand *MMO = M->getMemOperand();
3074
3075 // Don't worry if the offset ends up in a VGPR. Only one lane will have
3076 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
3077
3078 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
3079 // offset field) % 64. Some versions of the programming guide omit the m0
3080 // part, or claim it's from offset 0.
3081 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
3082 // If we have a constant offset, try to use the 0 in m0 as the base.
3083 // TODO: Look into changing the default m0 initialization value. If the
3084 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
3085 // the immediate offset.
3086 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
3087 ImmOffset = ConstOffset->getZExtValue();
3088 } else {
3089 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
3090 ImmOffset = BaseOffset.getConstantOperandVal(1);
3091 BaseOffset = BaseOffset.getOperand(0);
3092 }
3093
3094 // Prefer to do the shift in an SGPR since it should be possible to use m0
3095 // as the result directly. If it's already an SGPR, it will be eliminated
3096 // later.
3097 SDNode *SGPROffset
3098 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
3099 BaseOffset);
3100 // Shift to offset in m0
3101 SDNode *M0Base
3102 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3103 SDValue(SGPROffset, 0),
3104 CurDAG->getTargetConstant(16, SL, MVT::i32));
3105 glueCopyToM0(N, SDValue(M0Base, 0));
3106 }
3107
3108 SDValue Chain = N->getOperand(0);
3109 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
3110
3111 const unsigned Opc = gwsIntrinToOpcode(IntrID);
3112
3113 const MCInstrDesc &InstrDesc = TII->get(Opc);
3114 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
3115
3116 const TargetRegisterClass *DataRC = TII->getRegClass(InstrDesc, Data0Idx);
3117
3119 if (HasVSrc) {
3120 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3121
3122 SDValue Data = N->getOperand(2);
3123 MVT DataVT = Data.getValueType().getSimpleVT();
3124 if (TRI->isTypeLegalForClass(*DataRC, DataVT)) {
3125 // Normal 32-bit case.
3126 Ops.push_back(N->getOperand(2));
3127 } else {
3128 // Operand is really 32-bits, but requires 64-bit alignment, so use the
3129 // even aligned 64-bit register class.
3130 const SDValue RegSeqOps[] = {
3131 CurDAG->getTargetConstant(DataRC->getID(), SL, MVT::i32), Data,
3132 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3133 SDValue(
3134 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, MVT::i32),
3135 0),
3136 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32)};
3137
3138 Ops.push_back(SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
3139 SL, MVT::v2i32, RegSeqOps),
3140 0));
3141 }
3142 }
3143
3144 Ops.push_back(OffsetField);
3145 Ops.push_back(Chain);
3146
3147 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
3148 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
3149}
3150
// Select llvm.amdgcn.interp.p1.f16. On subtargets with 16-bank LDS it must
// expand to V_INTERP_MOV_F32 + V_INTERP_P1LV_F16 glued through the M0 copy;
// otherwise the tablegen pattern handles it.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //    (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  // Glue chained through InterpMov keeps the M0 copy ordered before both
  // instructions.
  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
3208
3209void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
3210 unsigned IntrID = N->getConstantOperandVal(1);
3211 switch (IntrID) {
3212 case Intrinsic::amdgcn_ds_append:
3213 case Intrinsic::amdgcn_ds_consume: {
3214 if (N->getValueType(0) != MVT::i32)
3215 break;
3216 SelectDSAppendConsume(N, IntrID);
3217 return;
3218 }
3219 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
3220 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
3221 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
3222 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
3223 SelectDSBvhStackIntrinsic(N, IntrID);
3224 return;
3225 case Intrinsic::amdgcn_init_whole_wave:
3226 CurDAG->getMachineFunction()
3227 .getInfo<SIMachineFunctionInfo>()
3228 ->setInitWholeWave();
3229 break;
3230 }
3231
3232 SelectCode(N);
3233}
3234
// Select chain-less intrinsics. Most map to a single pseudo taking one
// source operand; interp_p1_f16 and the permlane swaps get custom handling.
// Convergence-control glue, when present, is rewritten into a
// CONVERGENCECTRL_GLUE machine node and re-attached to the selected node.
void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
  unsigned IntrID = N->getConstantOperandVal(0);
  unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
  SDNode *ConvGlueNode = N->getGluedNode();
  if (ConvGlueNode) {
    // FIXME: Possibly iterate over multiple glue nodes?
    assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
    ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
    ConvGlueNode =
        CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
                               MVT::Glue, SDValue(ConvGlueNode, 0));
  } else {
    ConvGlueNode = nullptr;
  }
  switch (IntrID) {
  case Intrinsic::amdgcn_wqm:
    Opcode = AMDGPU::WQM;
    break;
  case Intrinsic::amdgcn_softwqm:
    Opcode = AMDGPU::SOFT_WQM;
    break;
  case Intrinsic::amdgcn_wwm:
  case Intrinsic::amdgcn_strict_wwm:
    Opcode = AMDGPU::STRICT_WWM;
    break;
  case Intrinsic::amdgcn_strict_wqm:
    Opcode = AMDGPU::STRICT_WQM;
    break;
  case Intrinsic::amdgcn_interp_p1_f16:
    SelectInterpP1F16(N);
    return;
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap: {
    if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
         !Subtarget->hasPermlane16Swap()) ||
        (IntrID == Intrinsic::amdgcn_permlane32_swap &&
         !Subtarget->hasPermlane32Swap())) {
      SelectCode(N); // Hit the default error
      return;
    }

    Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

    // Drop the intrinsic-ID operand; append the rewritten glue here since
    // this case returns before the generic glue handling below.
    SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
    if (ConvGlueNode)
      NewOps.push_back(SDValue(ConvGlueNode, 0));

    // Translate the boolean "fetch invalid" operand into the DPP FI encoding.
    bool FI = N->getConstantOperandVal(3);
    NewOps[2] = CurDAG->getTargetConstant(
        FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(), MVT::i32);

    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
    return;
  }
  default:
    SelectCode(N);
    break;
  }

  if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
    SDValue Src = N->getOperand(1);
    CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
  }

  if (ConvGlueNode) {
    SmallVector<SDValue, 4> NewOps(N->ops());
    NewOps.push_back(SDValue(ConvGlueNode, 0));
    CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
  }
}
3307
3308void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
3309 unsigned IntrID = N->getConstantOperandVal(1);
3310 switch (IntrID) {
3311 case Intrinsic::amdgcn_ds_gws_init:
3312 case Intrinsic::amdgcn_ds_gws_barrier:
3313 case Intrinsic::amdgcn_ds_gws_sema_v:
3314 case Intrinsic::amdgcn_ds_gws_sema_br:
3315 case Intrinsic::amdgcn_ds_gws_sema_p:
3316 case Intrinsic::amdgcn_ds_gws_sema_release_all:
3317 SelectDS_GWS(N, IntrID);
3318 return;
3319 case Intrinsic::amdgcn_tensor_load_to_lds:
3320 case Intrinsic::amdgcn_tensor_store_from_lds:
3321 SelectTensorLoadStore(N, IntrID);
3322 return;
3323 default:
3324 break;
3325 }
3326
3327 SelectCode(N);
3328}
3329
3330void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
3331 SDValue Log2WaveSize =
3332 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
3333 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
3334 {N->getOperand(0), Log2WaveSize});
3335}
3336
3337void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
3338 SDValue SrcVal = N->getOperand(1);
3339 if (SrcVal.getValueType() != MVT::i32) {
3340 SelectCode(N); // Emit default error
3341 return;
3342 }
3343
3344 SDValue CopyVal;
3345 Register SP = TLI->getStackPointerRegisterToSaveRestore();
3346 SDLoc SL(N);
3347
3348 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
3349 CopyVal = SrcVal.getOperand(0);
3350 } else {
3351 SDValue Log2WaveSize = CurDAG->getTargetConstant(
3352 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
3353
3354 if (N->isDivergent()) {
3355 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
3356 MVT::i32, SrcVal),
3357 0);
3358 }
3359
3360 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
3361 {SrcVal, Log2WaveSize}),
3362 0);
3363 }
3364
3365 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
3366 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
3367}
3368
3369bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
3370 unsigned &Mods,
3371 bool IsCanonicalizing,
3372 bool AllowAbs) const {
3373 Mods = SISrcMods::NONE;
3374 Src = In;
3375
3376 if (Src.getOpcode() == ISD::FNEG) {
3377 Mods |= SISrcMods::NEG;
3378 Src = Src.getOperand(0);
3379 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
3380 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3381 // denormal mode, but we're implicitly canonicalizing in a source operand.
3382 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
3383 if (LHS && LHS->isZero()) {
3384 Mods |= SISrcMods::NEG;
3385 Src = Src.getOperand(1);
3386 }
3387 }
3388
3389 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
3390 Mods |= SISrcMods::ABS;
3391 Src = Src.getOperand(0);
3392 }
3393
3394 if (Mods != SISrcMods::NONE)
3395 return true;
3396
3397 // Convert various sign-bit masks on integers to src mods. Currently disabled
3398 // for 16-bit types as the codegen replaces the operand without adding a
3399 // srcmod. This is intentionally finding the cases where we are performing
3400 // float neg and abs on int types, the goal is not to obtain two's complement
3401 // neg or abs. Limit converison to select operands via the nonCanonalizing
3402 // pattern.
3403 // TODO: Add 16-bit support.
3404 if (IsCanonicalizing)
3405 return true;
3406
3407 // v2i32 xor/or/and are legal. A vselect using these instructions as operands
3408 // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
3409 // through the extract to the bitwise op.
3410 SDValue PeekSrc =
3411 Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
3412 // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
3413 // types as the codegen replaces the operand without adding a srcmod.
3414 // This is intentionally finding the cases where we are performing float neg
3415 // and abs on int types, the goal is not to obtain two's complement neg or
3416 // abs.
3417 // TODO: Add 16-bit support.
3418 unsigned Opc = PeekSrc.getOpcode();
3419 EVT VT = Src.getValueType();
3420 if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
3421 (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
3422 return true;
3423
3424 ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc->getOperand(1));
3425 if (!CRHS)
3426 return true;
3427
3428 auto ReplaceSrc = [&]() -> SDValue {
3429 if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
3430 return Src.getOperand(0);
3431
3432 SDValue LHS = PeekSrc->getOperand(0);
3433 SDValue Index = Src->getOperand(1);
3434 return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
3435 Src.getValueType(), LHS, Index);
3436 };
3437
3438 // Recognise Srcmods:
3439 // (xor a, 0x80000000) or v2i32 (xor a, {0x80000000,0x80000000}) as NEG.
3440 // (and a, 0x7fffffff) or v2i32 (and a, {0x7fffffff,0x7fffffff}) as ABS.
3441 // (or a, 0x80000000) or v2i32 (or a, {0x80000000,0x80000000}) as NEG+ABS
3442 // SrcModifiers.
3443 if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
3444 Mods |= SISrcMods::NEG;
3445 Src = ReplaceSrc();
3446 } else if (Opc == ISD::AND && AllowAbs &&
3447 CRHS->getAPIntValue().isMaxSignedValue()) {
3448 Mods |= SISrcMods::ABS;
3449 Src = ReplaceSrc();
3450 } else if (Opc == ISD::OR && AllowAbs && CRHS->getAPIntValue().isSignMask()) {
3452 Src = ReplaceSrc();
3453 }
3454
3455 return true;
3456}
3457
3458bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
3459 SDValue &SrcMods) const {
3460 unsigned Mods;
3461 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
3462 /*AllowAbs=*/true)) {
3463 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3464 return true;
3465 }
3466
3467 return false;
3468}
3469
3470bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
3471 SDValue In, SDValue &Src, SDValue &SrcMods) const {
3472 unsigned Mods;
3473 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
3474 /*AllowAbs=*/true)) {
3475 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3476 return true;
3477 }
3478
3479 return false;
3480}
3481
3482bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
3483 SDValue &SrcMods) const {
3484 unsigned Mods;
3485 if (SelectVOP3ModsImpl(In, Src, Mods,
3486 /*IsCanonicalizing=*/true,
3487 /*AllowAbs=*/false)) {
3488 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3489 return true;
3490 }
3491
3492 return false;
3493}
3494
3495bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
3496 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
3497 return false;
3498
3499 Src = In;
3500 return true;
3501}
3502
3503bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
3504 SDValue &SrcMods,
3505 bool OpSel) const {
3506 unsigned Mods;
3507 if (SelectVOP3ModsImpl(In, Src, Mods,
3508 /*IsCanonicalizing=*/true,
3509 /*AllowAbs=*/false)) {
3510 if (OpSel)
3511 Mods |= SISrcMods::OP_SEL_0;
3512 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3513 return true;
3514 }
3515
3516 return false;
3517}
3518
// Match VINTERP source modifiers for the low half (no op_sel).
bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
                                           SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
}
3523
// Match VINTERP source modifiers for the high half (op_sel set).
bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
                                             SDValue &SrcMods) const {
  return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
}
3528
3529bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
3530 SDValue &SrcMods, SDValue &Clamp,
3531 SDValue &Omod) const {
3532 SDLoc DL(In);
3533 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3534 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3535
3536 return SelectVOP3Mods(In, Src, SrcMods);
3537}
3538
3539bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
3540 SDValue &SrcMods, SDValue &Clamp,
3541 SDValue &Omod) const {
3542 SDLoc DL(In);
3543 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3544 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3545
3546 return SelectVOP3BMods(In, Src, SrcMods);
3547}
3548
3549bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3550 SDValue &Clamp, SDValue &Omod) const {
3551 Src = In;
3552
3553 SDLoc DL(In);
3554 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3555 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3556
3557 return true;
3558}
3559
3560bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3561 SDValue &SrcMods, bool IsDOT) const {
3562 unsigned Mods = SISrcMods::NONE;
3563 Src = In;
3564
3565 // TODO: Handle G_FSUB 0 as fneg
3566 if (Src.getOpcode() == ISD::FNEG) {
3568 Src = Src.getOperand(0);
3569 }
3570
3571 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3572 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3573 unsigned VecMods = Mods;
3574
3575 SDValue Lo = stripBitcast(Src.getOperand(0));
3576 SDValue Hi = stripBitcast(Src.getOperand(1));
3577
3578 if (Lo.getOpcode() == ISD::FNEG) {
3579 Lo = stripBitcast(Lo.getOperand(0));
3580 Mods ^= SISrcMods::NEG;
3581 }
3582
3583 if (Hi.getOpcode() == ISD::FNEG) {
3584 Hi = stripBitcast(Hi.getOperand(0));
3585 Mods ^= SISrcMods::NEG_HI;
3586 }
3587
3588 if (isExtractHiElt(Lo, Lo))
3589 Mods |= SISrcMods::OP_SEL_0;
3590
3591 if (isExtractHiElt(Hi, Hi))
3592 Mods |= SISrcMods::OP_SEL_1;
3593
3594 unsigned VecSize = Src.getValueSizeInBits();
3595 Lo = stripExtractLoElt(Lo);
3596 Hi = stripExtractLoElt(Hi);
3597
3598 if (Lo.getValueSizeInBits() > VecSize) {
3599 Lo = CurDAG->getTargetExtractSubreg(
3600 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3601 MVT::getIntegerVT(VecSize), Lo);
3602 }
3603
3604 if (Hi.getValueSizeInBits() > VecSize) {
3605 Hi = CurDAG->getTargetExtractSubreg(
3606 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3607 MVT::getIntegerVT(VecSize), Hi);
3608 }
3609
3610 assert(Lo.getValueSizeInBits() <= VecSize &&
3611 Hi.getValueSizeInBits() <= VecSize);
3612
3613 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3614 // Really a scalar input. Just select from the low half of the register to
3615 // avoid packing.
3616
3617 if (VecSize == Lo.getValueSizeInBits()) {
3618 Src = Lo;
3619 } else if (VecSize == 32) {
3620 Src = createVOP3PSrc32FromLo16(Lo, Src, CurDAG, Subtarget);
3621 } else {
3622 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3623
3624 SDLoc SL(In);
3626 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3627 Lo.getValueType()), 0);
3628 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3629 : AMDGPU::SReg_64RegClassID;
3630 const SDValue Ops[] = {
3631 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3632 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3633 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3634
3635 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3636 Src.getValueType(), Ops), 0);
3637 }
3638 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3639 return true;
3640 }
3641
3642 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3643 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3644 .bitcastToAPInt().getZExtValue();
3645 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3646 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3647 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3648 return true;
3649 }
3650 }
3651
3652 Mods = VecMods;
3653 } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
3654 Src.getNumOperands() == 2) {
3655
3656 // TODO: We should repeat the build_vector source check above for the
3657 // vector_shuffle for negates and casts of individual elements.
3658
3659 auto *SVN = cast<ShuffleVectorSDNode>(Src);
3660 ArrayRef<int> Mask = SVN->getMask();
3661
3662 if (Mask[0] < 2 && Mask[1] < 2) {
3663 // src1 should be undef.
3664 SDValue ShuffleSrc = SVN->getOperand(0);
3665
3666 if (ShuffleSrc.getOpcode() == ISD::FNEG) {
3667 ShuffleSrc = ShuffleSrc.getOperand(0);
3669 }
3670
3671 if (Mask[0] == 1)
3672 Mods |= SISrcMods::OP_SEL_0;
3673 if (Mask[1] == 1)
3674 Mods |= SISrcMods::OP_SEL_1;
3675
3676 Src = ShuffleSrc;
3677 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3678 return true;
3679 }
3680 }
3681
3682 // Packed instructions do not have abs modifiers.
3683 Mods |= SISrcMods::OP_SEL_1;
3684
3685 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3686 return true;
3687}
3688
// DOT-instruction variant of packed modifier matching (IsDOT = true).
bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
                                            SDValue &SrcMods) const {
  return SelectVOP3PMods(In, Src, SrcMods, true);
}
3693
3694bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3695 SDValue &Src) const {
3696 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3697 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3698
3699 unsigned Mods = SISrcMods::OP_SEL_1;
3700 unsigned SrcVal = C->getZExtValue();
3701 if (SrcVal == 1)
3702 Mods |= SISrcMods::OP_SEL_0;
3703
3704 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3705 return true;
3706}
3707
3709 llvm::SelectionDAG *CurDAG,
3710 const SDLoc &DL) {
3711 unsigned DstRegClass;
3712 EVT DstTy;
3713 switch (Elts.size()) {
3714 case 8:
3715 DstRegClass = AMDGPU::VReg_256RegClassID;
3716 DstTy = MVT::v8i32;
3717 break;
3718 case 4:
3719 DstRegClass = AMDGPU::VReg_128RegClassID;
3720 DstTy = MVT::v4i32;
3721 break;
3722 case 2:
3723 DstRegClass = AMDGPU::VReg_64RegClassID;
3724 DstTy = MVT::v2i32;
3725 break;
3726 default:
3727 llvm_unreachable("unhandled Reg sequence size");
3728 }
3729
3731 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3732 for (unsigned i = 0; i < Elts.size(); ++i) {
3733 Ops.push_back(Elts[i]);
3734 Ops.push_back(CurDAG->getTargetConstant(
3736 }
3737 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3738}
3739
3741 llvm::SelectionDAG *CurDAG,
3742 const SDLoc &DL) {
3743 SmallVector<SDValue, 8> PackedElts;
3744 assert("unhandled Reg sequence size" &&
3745 (Elts.size() == 8 || Elts.size() == 16));
3746
3747 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
3748 // unpacked from 32-bit source use it, otherwise pack them using v_perm.
3749 for (unsigned i = 0; i < Elts.size(); i += 2) {
3750 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3751 SDValue HiSrc;
3752 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3753 PackedElts.push_back(HiSrc);
3754 } else {
3755 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3756 MachineSDNode *Packed =
3757 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3758 {Elts[i + 1], Elts[i], PackLoLo});
3759 PackedElts.push_back(SDValue(Packed, 0));
3760 }
3761 }
3762
3763 return buildRegSequence32(PackedElts, CurDAG, DL);
3764}
3765
3767 llvm::SelectionDAG *CurDAG,
3768 const SDLoc &DL, unsigned ElementSize) {
3769 if (ElementSize == 16)
3770 return buildRegSequence16(Elts, CurDAG, DL);
3771 if (ElementSize == 32)
3772 return buildRegSequence32(Elts, CurDAG, DL);
3773 llvm_unreachable("Unhandled element size");
3774}
3775
3776static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3778 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3779 unsigned ElementSize) {
3780 if (ModOpcode == ISD::FNEG) {
3781 Mods |= SISrcMods::NEG;
3782 // Check if all elements also have abs modifier
3783 SmallVector<SDValue, 8> NegAbsElts;
3784 for (auto El : Elts) {
3785 if (El.getOpcode() != ISD::FABS)
3786 break;
3787 NegAbsElts.push_back(El->getOperand(0));
3788 }
3789 if (Elts.size() != NegAbsElts.size()) {
3790 // Neg
3791 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3792 } else {
3793 // Neg and Abs
3794 Mods |= SISrcMods::NEG_HI;
3795 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3796 }
3797 } else {
3798 assert(ModOpcode == ISD::FABS);
3799 // Abs
3800 Mods |= SISrcMods::NEG_HI;
3801 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3802 }
3803}
3804
3805// Check all f16 elements for modifiers while looking through b32 and v2b16
3806// build vector, stop if element does not satisfy ModifierCheck.
3807static void
3809 std::function<bool(SDValue)> ModifierCheck) {
3810 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3811 if (auto *F16Pair =
3812 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3813 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3814 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3815 if (!ModifierCheck(ElF16))
3816 break;
3817 }
3818 }
3819 }
3820}
3821
3822bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3823 SDValue &SrcMods) const {
3824 Src = In;
3825 unsigned Mods = SISrcMods::OP_SEL_1;
3826
3827 // mods are on f16 elements
3828 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3830
3831 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3832 if (Element.getOpcode() != ISD::FNEG)
3833 return false;
3834 EltsF16.push_back(Element.getOperand(0));
3835 return true;
3836 });
3837
3838 // All elements have neg modifier
3839 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3840 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3841 Mods |= SISrcMods::NEG;
3842 Mods |= SISrcMods::NEG_HI;
3843 }
3844 }
3845
3846 // mods are on v2f16 elements
3847 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3848 SmallVector<SDValue, 8> EltsV2F16;
3849 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3850 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3851 // Based on first element decide which mod we match, neg or abs
3852 if (ElV2f16.getOpcode() != ISD::FNEG)
3853 break;
3854 EltsV2F16.push_back(ElV2f16.getOperand(0));
3855 }
3856
3857 // All pairs of elements have neg modifier
3858 if (BV->getNumOperands() == EltsV2F16.size()) {
3859 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3860 Mods |= SISrcMods::NEG;
3861 Mods |= SISrcMods::NEG_HI;
3862 }
3863 }
3864
3865 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3866 return true;
3867}
3868
3869bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3870 SDValue &SrcMods) const {
3871 Src = In;
3872 unsigned Mods = SISrcMods::OP_SEL_1;
3873 unsigned ModOpcode;
3874
3875 // mods are on f16 elements
3876 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3878 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3879 // Based on first element decide which mod we match, neg or abs
3880 if (EltsF16.empty())
3881 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3882 if (ElF16.getOpcode() != ModOpcode)
3883 return false;
3884 EltsF16.push_back(ElF16.getOperand(0));
3885 return true;
3886 });
3887
3888 // All elements have ModOpcode modifier
3889 if (BV->getNumOperands() * 2 == EltsF16.size())
3890 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3891 16);
3892 }
3893
3894 // mods are on v2f16 elements
3895 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3896 SmallVector<SDValue, 8> EltsV2F16;
3897
3898 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3899 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3900 // Based on first element decide which mod we match, neg or abs
3901 if (EltsV2F16.empty())
3902 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3903 if (ElV2f16->getOpcode() != ModOpcode)
3904 break;
3905 EltsV2F16.push_back(ElV2f16->getOperand(0));
3906 }
3907
3908 // All elements have ModOpcode modifier
3909 if (BV->getNumOperands() == EltsV2F16.size())
3910 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3911 32);
3912 }
3913
3914 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3915 return true;
3916}
3917
3918bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3919 SDValue &SrcMods) const {
3920 Src = In;
3921 unsigned Mods = SISrcMods::OP_SEL_1;
3923
3924 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3925 assert(BV->getNumOperands() > 0);
3926 // Based on first element decide which mod we match, neg or abs
3927 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3928 unsigned ModOpcode =
3929 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3930 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3931 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3932 if (ElF32.getOpcode() != ModOpcode)
3933 break;
3934 EltsF32.push_back(ElF32.getOperand(0));
3935 }
3936
3937 // All elements had ModOpcode modifier
3938 if (BV->getNumOperands() == EltsF32.size())
3939 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3940 32);
3941 }
3942
3943 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3944 return true;
3945}
3946
3947bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3948 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3949 BitVector UndefElements;
3950 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3951 if (isInlineImmediate(Splat.getNode())) {
3952 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3953 unsigned Imm = C->getAPIntValue().getSExtValue();
3954 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3955 return true;
3956 }
3957 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3958 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3959 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3960 return true;
3961 }
3962 llvm_unreachable("unhandled Constant node");
3963 }
3964 }
3965
3966 // 16 bit splat
3967 SDValue SplatSrc32 = stripBitcast(In);
3968 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3969 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3970 SDValue SplatSrc16 = stripBitcast(Splat32);
3971 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3972 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3973 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3974 std::optional<APInt> RawValue;
3975 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3976 RawValue = C->getValueAPF().bitcastToAPInt();
3977 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3978 RawValue = C->getAPIntValue();
3979
3980 if (RawValue.has_value()) {
3981 EVT VT = In.getValueType().getScalarType();
3982 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3983 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3986 RawValue.value());
3987 if (TII->isInlineConstant(FloatVal)) {
3988 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3989 MVT::i16);
3990 return true;
3991 }
3992 } else if (VT.getSimpleVT() == MVT::i16) {
3993 if (TII->isInlineConstant(RawValue.value())) {
3994 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3995 MVT::i16);
3996 return true;
3997 }
3998 } else
3999 llvm_unreachable("unknown 16-bit type");
4000 }
4001 }
4002 }
4003
4004 return false;
4005}
4006
4007bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
4008 SDValue &IndexKey) const {
4009 unsigned Key = 0;
4010 Src = In;
4011
4012 if (In.getOpcode() == ISD::SRL) {
4013 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4014 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4015 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4016 ShiftAmt->getZExtValue() % 8 == 0) {
4017 Key = ShiftAmt->getZExtValue() / 8;
4018 Src = ShiftSrc;
4019 }
4020 }
4021
4022 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4023 return true;
4024}
4025
4026bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
4027 SDValue &IndexKey) const {
4028 unsigned Key = 0;
4029 Src = In;
4030
4031 if (In.getOpcode() == ISD::SRL) {
4032 const llvm::SDValue &ShiftSrc = In.getOperand(0);
4033 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
4034 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
4035 ShiftAmt->getZExtValue() == 16) {
4036 Key = 1;
4037 Src = ShiftSrc;
4038 }
4039 }
4040
4041 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4042 return true;
4043}
4044
4045bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src,
4046 SDValue &IndexKey) const {
4047 unsigned Key = 0;
4048 Src = In;
4049
4050 SDValue InI32;
4051
4052 if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) {
4053 const SDValue &ExtendSrc = In.getOperand(0);
4054 if (ExtendSrc.getValueSizeInBits() == 32)
4055 InI32 = ExtendSrc;
4056 } else if (In->getOpcode() == ISD::BITCAST) {
4057 const SDValue &CastSrc = In.getOperand(0);
4058 if (CastSrc.getOpcode() == ISD::BUILD_VECTOR &&
4059 CastSrc.getOperand(0).getValueSizeInBits() == 32) {
4060 ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1));
4061 if (Zero && Zero->getZExtValue() == 0)
4062 InI32 = CastSrc.getOperand(0);
4063 }
4064 }
4065
4066 if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
4067 const SDValue &ExtractVecEltSrc = InI32.getOperand(0);
4068 ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1));
4069 if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx &&
4070 EltIdx->getZExtValue() == 1) {
4071 Key = 1;
4072 Src = ExtractVecEltSrc;
4073 }
4074 }
4075
4076 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
4077 return true;
4078}
4079
4080bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
4081 SDValue &SrcMods) const {
4082 Src = In;
4083 // FIXME: Handle op_sel
4084 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
4085 return true;
4086}
4087
4088bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
4089 SDValue &SrcMods) const {
4090 // FIXME: Handle op_sel
4091 return SelectVOP3Mods(In, Src, SrcMods);
4092}
4093
4094// Match lowered fpext from bf16 to f32. This is a bit operation extending
4095// a 16-bit value with 16-bit of zeroes at LSB:
4096//
4097// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
4098// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
4099// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
4100static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
4101 if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
4102 return SDValue();
4103 Op = Op.getOperand(0);
4104
4105 IsExtractHigh = false;
4106 if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
4107 auto Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
4108 if (!Low16 || !Low16->isZero())
4109 return SDValue();
4110 Op = stripBitcast(Op.getOperand(1));
4111 if (Op.getValueType() != MVT::bf16)
4112 return SDValue();
4113 return Op;
4114 }
4115
4116 if (Op.getValueType() != MVT::i32)
4117 return SDValue();
4118
4119 if (Op.getOpcode() == ISD::AND) {
4120 if (auto Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4121 if (Mask->getZExtValue() == 0xffff0000) {
4122 IsExtractHigh = true;
4123 return Op.getOperand(0);
4124 }
4125 }
4126 return SDValue();
4127 }
4128
4129 if (Op.getOpcode() == ISD::SHL) {
4130 if (auto Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4131 if (Amt->getZExtValue() == 16)
4132 return Op.getOperand(0);
4133 }
4134 }
4135
4136 return SDValue();
4137}
4138
4139// The return value is not whether the match is possible (which it always is),
4140// but whether or not it a conversion is really used.
4141bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
4142 unsigned &Mods,
4143 MVT VT) const {
4144 Mods = 0;
4145 SelectVOP3ModsImpl(In, Src, Mods);
4146
4147 bool IsExtractHigh = false;
4148 if (Src.getOpcode() == ISD::FP_EXTEND) {
4149 Src = Src.getOperand(0);
4150 } else if (VT == MVT::bf16) {
4151 SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
4152 if (!B16)
4153 return false;
4154 Src = B16;
4155 } else
4156 return false;
4157
4158 if (Src.getValueType() != VT &&
4159 (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
4160 return false;
4161
4162 Src = stripBitcast(Src);
4163
4164 // Be careful about folding modifiers if we already have an abs. fneg is
4165 // applied last, so we don't want to apply an earlier fneg.
4166 if ((Mods & SISrcMods::ABS) == 0) {
4167 unsigned ModsTmp;
4168 SelectVOP3ModsImpl(Src, Src, ModsTmp);
4169
4170 if ((ModsTmp & SISrcMods::NEG) != 0)
4171 Mods ^= SISrcMods::NEG;
4172
4173 if ((ModsTmp & SISrcMods::ABS) != 0)
4174 Mods |= SISrcMods::ABS;
4175 }
4176
4177 // op_sel/op_sel_hi decide the source type and source.
4178 // If the source's op_sel_hi is set, it indicates to do a conversion from
4179 // fp16. If the sources's op_sel is set, it picks the high half of the source
4180 // register.
4181
4182 Mods |= SISrcMods::OP_SEL_1;
4183 if (Src.getValueSizeInBits() == 16) {
4184 if (isExtractHiElt(Src, Src)) {
4185 Mods |= SISrcMods::OP_SEL_0;
4186
4187 // TODO: Should we try to look for neg/abs here?
4188 return true;
4189 }
4190
4191 if (Src.getOpcode() == ISD::TRUNCATE &&
4192 Src.getOperand(0).getValueType() == MVT::i32) {
4193 Src = Src.getOperand(0);
4194 return true;
4195 }
4196
4197 if (Subtarget->useRealTrue16Insts())
4198 // In true16 mode, pack src to a 32bit
4199 Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
4200 } else if (IsExtractHigh)
4201 Mods |= SISrcMods::OP_SEL_0;
4202
4203 return true;
4204}
4205
4206bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
4207 SDValue &SrcMods) const {
4208 unsigned Mods = 0;
4209 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
4210 return false;
4211 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4212 return true;
4213}
4214
4215bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
4216 SDValue &SrcMods) const {
4217 unsigned Mods = 0;
4218 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
4219 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4220 return true;
4221}
4222
4223bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
4224 SDValue &SrcMods) const {
4225 unsigned Mods = 0;
4226 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
4227 return false;
4228 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4229 return true;
4230}
4231
4232bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
4233 SDValue &SrcMods) const {
4234 unsigned Mods = 0;
4235 SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
4236 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
4237 return true;
4238}
4239
4240// Match BITOP3 operation and return a number of matched instructions plus
4241// truth table.
4242static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
4244 unsigned NumOpcodes = 0;
4245 uint8_t LHSBits, RHSBits;
4246
4247 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
4248 // Define truth table given Src0, Src1, Src2 bits permutations:
4249 // 0 0 0
4250 // 0 0 1
4251 // 0 1 0
4252 // 0 1 1
4253 // 1 0 0
4254 // 1 0 1
4255 // 1 1 0
4256 // 1 1 1
4257 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
4258
4259 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
4260 if (C->isAllOnes()) {
4261 Bits = 0xff;
4262 return true;
4263 }
4264 if (C->isZero()) {
4265 Bits = 0;
4266 return true;
4267 }
4268 }
4269
4270 for (unsigned I = 0; I < Src.size(); ++I) {
4271 // Try to find existing reused operand
4272 if (Src[I] == Op) {
4273 Bits = SrcBits[I];
4274 return true;
4275 }
4276 // Try to replace parent operator
4277 if (Src[I] == In) {
4278 Bits = SrcBits[I];
4279 Src[I] = Op;
4280 return true;
4281 }
4282 }
4283
4284 if (Src.size() == 3) {
4285 // No room left for operands. Try one last time, there can be a 'not' of
4286 // one of our source operands. In this case we can compute the bits
4287 // without growing Src vector.
4288 if (Op.getOpcode() == ISD::XOR) {
4289 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4290 if (C->isAllOnes()) {
4291 SDValue LHS = Op.getOperand(0);
4292 for (unsigned I = 0; I < Src.size(); ++I) {
4293 if (Src[I] == LHS) {
4294 Bits = ~SrcBits[I];
4295 return true;
4296 }
4297 }
4298 }
4299 }
4300 }
4301
4302 return false;
4303 }
4304
4305 Bits = SrcBits[Src.size()];
4306 Src.push_back(Op);
4307 return true;
4308 };
4309
4310 switch (In.getOpcode()) {
4311 case ISD::AND:
4312 case ISD::OR:
4313 case ISD::XOR: {
4314 SDValue LHS = In.getOperand(0);
4315 SDValue RHS = In.getOperand(1);
4316
4317 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
4318 if (!getOperandBits(LHS, LHSBits) ||
4319 !getOperandBits(RHS, RHSBits)) {
4320 Src = std::move(Backup);
4321 return std::make_pair(0, 0);
4322 }
4323
4324 // Recursion is naturally limited by the size of the operand vector.
4325 auto Op = BitOp3_Op(LHS, Src);
4326 if (Op.first) {
4327 NumOpcodes += Op.first;
4328 LHSBits = Op.second;
4329 }
4330
4331 Op = BitOp3_Op(RHS, Src);
4332 if (Op.first) {
4333 NumOpcodes += Op.first;
4334 RHSBits = Op.second;
4335 }
4336 break;
4337 }
4338 default:
4339 return std::make_pair(0, 0);
4340 }
4341
4342 uint8_t TTbl;
4343 switch (In.getOpcode()) {
4344 case ISD::AND:
4345 TTbl = LHSBits & RHSBits;
4346 break;
4347 case ISD::OR:
4348 TTbl = LHSBits | RHSBits;
4349 break;
4350 case ISD::XOR:
4351 TTbl = LHSBits ^ RHSBits;
4352 break;
4353 default:
4354 break;
4355 }
4356
4357 return std::make_pair(NumOpcodes + 1, TTbl);
4358}
4359
4360bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
4361 SDValue &Src2, SDValue &Tbl) const {
4363 uint8_t TTbl;
4364 unsigned NumOpcodes;
4365
4366 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
4367
4368 // Src.empty() case can happen if all operands are all zero or all ones.
4369 // Normally it shall be optimized out before reaching this.
4370 if (NumOpcodes < 2 || Src.empty())
4371 return false;
4372
4373 // For a uniform case threshold should be higher to account for moves between
4374 // VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be in SGPRs
4375 // and a readtfirstlane after.
4376 if (NumOpcodes < 4 && !In->isDivergent())
4377 return false;
4378
4379 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
4380 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
4381 // asm more readable. This cannot be modeled with AddedComplexity because
4382 // selector does not know how many operations did we match.
4383 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
4384 (In.getOperand(0).getOpcode() == In.getOpcode() ||
4385 In.getOperand(1).getOpcode() == In.getOpcode()))
4386 return false;
4387
4388 if (In.getOpcode() == ISD::OR &&
4389 (In.getOperand(0).getOpcode() == ISD::AND ||
4390 In.getOperand(1).getOpcode() == ISD::AND))
4391 return false;
4392 }
4393
4394 // Last operand can be ignored, turning a ternary operation into a binary.
4395 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
4396 // 'c' with 'a' here without changing the answer. In some pathological
4397 // cases it should be possible to get an operation with a single operand
4398 // too if optimizer would not catch it.
4399 while (Src.size() < 3)
4400 Src.push_back(Src[0]);
4401
4402 Src0 = Src[0];
4403 Src1 = Src[1];
4404 Src2 = Src[2];
4405
4406 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
4407 return true;
4408}
4409
4410SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
4411 if (In.isUndef())
4412 return CurDAG->getUNDEF(MVT::i32);
4413
4414 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
4415 SDLoc SL(In);
4416 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
4417 }
4418
4419 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
4420 SDLoc SL(In);
4421 return CurDAG->getConstant(
4422 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
4423 }
4424
4425 SDValue Src;
4426 if (isExtractHiElt(In, Src))
4427 return Src;
4428
4429 return SDValue();
4430}
4431
4432bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
4433 assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
4434
4435 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
4436 const SIInstrInfo *SII = Subtarget->getInstrInfo();
4437
4438 unsigned Limit = 0;
4439 bool AllUsesAcceptSReg = true;
4440 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
4441 Limit < 10 && U != E; ++U, ++Limit) {
4442 const TargetRegisterClass *RC =
4443 getOperandRegClass(U->getUser(), U->getOperandNo());
4444
4445 // If the register class is unknown, it could be an unknown
4446 // register class that needs to be an SGPR, e.g. an inline asm
4447 // constraint
4448 if (!RC || SIRI->isSGPRClass(RC))
4449 return false;
4450
4451 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass &&
4452 RC != &AMDGPU::VS_64_Align2RegClass) {
4453 AllUsesAcceptSReg = false;
4454 SDNode *User = U->getUser();
4455 if (User->isMachineOpcode()) {
4456 unsigned Opc = User->getMachineOpcode();
4457 const MCInstrDesc &Desc = SII->get(Opc);
4458 if (Desc.isCommutable()) {
4459 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
4460 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
4461 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
4462 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
4463 const TargetRegisterClass *CommutedRC =
4464 getOperandRegClass(U->getUser(), CommutedOpNo);
4465 if (CommutedRC == &AMDGPU::VS_32RegClass ||
4466 CommutedRC == &AMDGPU::VS_64RegClass ||
4467 CommutedRC == &AMDGPU::VS_64_Align2RegClass)
4468 AllUsesAcceptSReg = true;
4469 }
4470 }
4471 }
4472 // If "AllUsesAcceptSReg == false" so far we haven't succeeded
4473 // commuting current user. This means have at least one use
4474 // that strictly require VGPR. Thus, we will not attempt to commute
4475 // other user instructions.
4476 if (!AllUsesAcceptSReg)
4477 break;
4478 }
4479 }
4480 return !AllUsesAcceptSReg && (Limit < 10);
4481}
4482
4483bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
4484 const auto *Ld = cast<LoadSDNode>(N);
4485 const MachineMemOperand *MMO = Ld->getMemOperand();
4486
4487 // FIXME: We ought to able able to take the direct isDivergent result. We
4488 // cannot rely on the MMO for a uniformity check, and should stop using
4489 // it. This is a hack for 2 ways that the IR divergence analysis is superior
4490 // to the DAG divergence: Recognizing shift-of-workitem-id as always
4491 // uniform, and isSingleLaneExecution. These should be handled in the DAG
4492 // version, and then this can be dropped.
4493 if (Ld->isDivergent() && !AMDGPU::isUniformMMO(MMO))
4494 return false;
4495
4496 return MMO->getSize().hasValue() &&
4497 Ld->getAlign() >=
4498 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
4499 uint64_t(4))) &&
4500 (MMO->isInvariant() ||
4501 (Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4502 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
4503 (Subtarget->getScalarizeGlobalBehavior() &&
4504 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
4505 Ld->isSimple() &&
4506 static_cast<const SITargetLowering *>(getTargetLowering())
4507 ->isMemOpHasNoClobberedMemOperand(N)));
4508}
4509
4512 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
4513 bool IsModified = false;
4514 do {
4515 IsModified = false;
4516
4517 // Go over all selected nodes and try to fold them a bit more
4518 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
4519 while (Position != CurDAG->allnodes_end()) {
4520 SDNode *Node = &*Position++;
4522 if (!MachineNode)
4523 continue;
4524
4525 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
4526 if (ResNode != Node) {
4527 if (ResNode)
4528 ReplaceUses(Node, ResNode);
4529 IsModified = true;
4530 }
4531 }
4532 CurDAG->RemoveDeadNodes();
4533 } while (IsModified);
4534}
4535
4540
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static MachineSDNode * buildRegSequence32(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned, const SelectionDAG *DAG)
static MemSDNode * findMemSDNode(SDNode *N)
static MachineSDNode * buildRegSequence16(SmallVectorImpl< SDValue > &Elts, llvm::SelectionDAG *CurDAG, const SDLoc &DL)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
static SDValue combineBallotPattern(SDValue VCMP, bool &Negate)
static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh)
static void checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, std::function< bool(SDValue)> ModifierCheck)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, Register &Out)
static std::pair< unsigned, uint8_t > BitOp3_Op(Register R, SmallVectorImpl< Register > &Src, const MachineRegisterInfo &MRI)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static Register buildRegSequence(SmallVectorImpl< Register > &Elts, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, SmallVectorImpl< Register > &Elts, Register &Src, MachineInstr *InsertPt, MachineRegisterInfo &MRI)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
MachineInstr unsigned OpIdx
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
SI DAG Lowering interface definition.
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
AMDGPUDAGToDAGISelLegacy(TargetMachine &TM, CodeGenOptLevel OptLevel)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
bool matchLoadD16FromBuildVector(SDNode *N) const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
AMDGPUISelDAGToDAGPass(TargetMachine &TM)
static SDValue stripBitcast(SDValue Val)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
static const fltSemantics & BFloat()
Definition APFloat.h:295
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:406
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1671
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:321
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const SIInstrInfo * getInstrInfo() const override
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Generation getGeneration() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool hasValue() const
TypeSize getValue() const
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
SelectionDAGISelLegacy(char &ID, std::unique_ptr< SelectionDAGISel > S)
SelectionDAGISelPass(std::unique_ptr< SelectionDAGISel > Selector)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
SelectionDAGISel(TargetMachine &tm, CodeGenOptLevel OL=CodeGenOptLevel::Default)
virtual bool runOnMachineFunction(MachineFunction &mf)
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
SDValue getTargetFrameIndex(int FI, EVT VT)
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
ilist< SDNode >::iterator allnodes_iterator
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static const unsigned CommuteAnyOperandIndex
Primary interface to the complete machine description for the target machine.
unsigned getID() const
Return the register class ID number.
Legacy analysis pass which computes a CycleInfo.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
bool isUniformMMO(const MachineMemOperand *MMO)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ TargetFrameIndex
Definition ISDOpcodes.h:187
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ BRCOND
BRCOND - Conditional branch.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Undef
Value of the register doesn't matter.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
static bool getConstantValue(SDValue N, uint32_t &Out)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into a AMDGPU-specific.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
#define N
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:317
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:363
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
static unsigned getSubRegFromChannel(unsigned Channel)
bool hasNoUnsignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.