//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

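// Whether the high 16 bits of the 32-bit register holding the f16 result of
// \p Opc are known to be zeroed on the current subtarget.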
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISelLegacy::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISelLegacy::getAnalysisUsage(AU);
}

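// Try to fold one half of a build_vector directly from memory into a d16
// load, which writes only the selected half of the destination register and
// takes the other half as a tied input.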
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    SDValue TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

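// Rebuild \p N so that its chain operand is NewChain and a glue operand is
// appended; used to attach a preceding M0 initialization to the node.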
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

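// DS (LDS) and GDS (region) accesses read M0 on some subtargets; initialize
// it first (all-ones for LDS, the GDS size for GDS).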
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(
          N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

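// Materialize a 64-bit scalar immediate as two S_MOV_B32 halves combined
// with a REG_SEQUENCE.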
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
  SDNode *Hi = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

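// Main entry point for instruction selection: handle the opcodes that need
// custom handling before deferring to the generated matcher via SelectCode.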
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

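// A branch is treated as uniform if an earlier pass (e.g. the CFG
// structurizer) marked its terminator with uniformity metadata.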
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

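// Returns true if (and x, mask) used as a shift amount of ShAmtBits bits is
// redundant, i.e. the mask keeps every low bit the shift can observe.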
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}

726
728 SDValue &N0, SDValue &N1) {
729 if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
730 Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
731 // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
732 // (i64 (bitcast (v2i32 (build_vector
733 // (or (extract_vector_elt V, 0), OFFSET),
734 // (extract_vector_elt V, 1)))))
735 SDValue Lo = Addr.getOperand(0).getOperand(0);
736 if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
737 SDValue BaseLo = Lo.getOperand(0);
738 SDValue BaseHi = Addr.getOperand(0).getOperand(1);
739 // Check that split base (Lo and Hi) are extracted from the same one.
740 if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
742 BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
743 // Lo is statically extracted from index 0.
744 isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
745 BaseLo.getConstantOperandVal(1) == 0 &&
746 // Hi is statically extracted from index 0.
747 isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
748 BaseHi.getConstantOperandVal(1) == 1) {
749 N0 = BaseLo.getOperand(0).getOperand(0);
750 N1 = Lo.getOperand(1);
751 return true;
752 }
753 }
754 }
755 return false;
756}
757
bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(GCNTargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

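// Lower a 64-bit add/sub into 32-bit halves: the low half produces a carry
// that the high half consumes, and the results are recombined with a
// REG_SEQUENCE.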
// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

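// Select FMA with a chain by hand: fill the VOP3 modifier/clamp/omod operand
// slots and pass the chain and trailing operand through.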
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(
      Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

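// DS instructions take a 16-bit unsigned byte offset; it can only be folded
// if it fits and, on subtargets with the quirk below, the base is known
// non-negative.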
bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}

// Check that the address value in SGPR/VGPR is legal for flat scratch in the
// form of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address value in SGPR/VGPR is legal for flat scratch in the
// form of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16 bits into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
  return true;
}

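// Split a MUBUF address into the pieces the encoding needs: the SRsrc base
// pointer, a VGPR address, SGPR and immediate offsets, and the
// offen/idxen/addr64 mode bits.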
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  const auto *RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset, SDValue &Offset
                                           ) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(32); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode *findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

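// Common matcher for flat/global/scratch addressing: fold a legal signed
// immediate into the offset field, splitting out and adding back any
// remainder that does not fit.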
bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

1760// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1761bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1762 SDValue Addr,
1763 SDValue &SAddr,
1764 SDValue &VOffset,
1765 SDValue &Offset) const {
1766 int64_t ImmOffset = 0;
1767
1768 // Match the immediate offset first, which canonically is moved as low as
1769 // possible.
1770
1771 SDValue LHS, RHS;
1772 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1773 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1774 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1775
1776    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1777                               SIInstrFlags::FlatGlobal)) {
1778 Addr = LHS;
1779 ImmOffset = COffsetVal;
1780 } else if (!LHS->isDivergent()) {
1781 if (COffsetVal > 0) {
1782 SDLoc SL(N);
1783 // saddr + large_offset -> saddr +
1784 // (voffset = large_offset & ~MaxOffset) +
1785 // (large_offset & MaxOffset);
1786 int64_t SplitImmOffset, RemainderOffset;
1787        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1788            COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1789
1790 if (isUInt<32>(RemainderOffset)) {
1791 SDNode *VMov = CurDAG->getMachineNode(
1792 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1793 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1794 VOffset = SDValue(VMov, 0);
1795 SAddr = LHS;
1796 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1797 return true;
1798 }
1799 }
1800
1801      // We are adding a 64 bit SGPR and a constant. If the constant bus limit
1802      // is 1, we would need 1 or 2 extra moves for each half of the constant,
1803      // so it is better to do a scalar add and then issue a single VALU
1804      // instruction to materialize zero. Otherwise it takes fewer instructions
1805      // to perform VALU adds with immediates or inline literals.
1806 unsigned NumLiterals =
1807 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1808 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1809 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1810 return false;
1811 }
1812 }
1813
1814 // Match the variable offset.
1815 if (Addr.getOpcode() == ISD::ADD) {
1816 LHS = Addr.getOperand(0);
1817 RHS = Addr.getOperand(1);
1818
1819 if (!LHS->isDivergent()) {
1820 // add (i64 sgpr), (zero_extend (i32 vgpr))
1821 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1822 SAddr = LHS;
1823 VOffset = ZextRHS;
1824 }
1825 }
1826
1827 if (!SAddr && !RHS->isDivergent()) {
1828 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1829 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1830 SAddr = RHS;
1831 VOffset = ZextLHS;
1832 }
1833 }
1834
1835 if (SAddr) {
1836 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1837 return true;
1838 }
1839 }
1840
1841 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1842 isa<ConstantSDNode>(Addr))
1843 return false;
1844
1845 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1846 // moves required to copy a 64-bit SGPR to VGPR.
1847 SAddr = Addr;
1848 SDNode *VMov =
1849 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1850 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1851 VOffset = SDValue(VMov, 0);
1852 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1853 return true;
1854}
1855
1856 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1857 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1858 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1859 } else if (SAddr.getOpcode() == ISD::ADD &&
1860 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1861    // Materialize this into a scalar move for a scalar address to avoid
1862    // a readfirstlane.
1863 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1864 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1865 FI->getValueType(0));
1866 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1867 MVT::i32, TFI, SAddr.getOperand(1)),
1868 0);
1869 }
1870
1871 return SAddr;
1872}
1873
1874// Match (32-bit SGPR base) + sext(imm offset)
1875bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1876 SDValue &SAddr,
1877 SDValue &Offset) const {
1878 if (Addr->isDivergent())
1879 return false;
1880
1881 SDLoc DL(Addr);
1882
1883 int64_t COffsetVal = 0;
1884
1885 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1886 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1887 SAddr = Addr.getOperand(0);
1888 } else {
1889 SAddr = Addr;
1890 }
1891
1892 SAddr = SelectSAddrFI(CurDAG, SAddr);
1893
1894 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1895
1896  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1897                              SIInstrFlags::FlatScratch)) {
1898 int64_t SplitImmOffset, RemainderOffset;
1899    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1900        COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1901
1902 COffsetVal = SplitImmOffset;
1903
1904    SDValue AddOffset =
1905        SAddr.getOpcode() == ISD::TargetFrameIndex
1906 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1907 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
1908 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1909 SAddr, AddOffset),
1910 0);
1911 }
1912
1913 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
1914
1915 return true;
1916}
1917
1918// Check whether the flat scratch SVS swizzle bug affects this access.
1919bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1920 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1921 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1922 return false;
1923
1924 // The bug affects the swizzling of SVS accesses if there is any carry out
1925 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1926 // voffset to (soffset + inst_offset).
1927 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1928  KnownBits SKnown =
1929      KnownBits::add(CurDAG->computeKnownBits(SAddr),
1930 KnownBits::makeConstant(APInt(32, ImmOffset,
1931 /*isSigned=*/true)));
1932  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1933  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1934 return (VMax & 3) + (SMax & 3) >= 4;
1935}
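// Illustrative check (hypothetical known-bits values): if VMax & 3 == 2 and
// SMax & 3 == 3, then 2 + 3 >= 4, meaning adding voffset to
// (soffset + inst_offset) can carry from bit 1 into bit 2, so the access is
// conservatively rejected above.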
1936
1937bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1938 SDValue &VAddr, SDValue &SAddr,
1939 SDValue &Offset) const {
1940 int64_t ImmOffset = 0;
1941
1942 SDValue LHS, RHS;
1943 SDValue OrigAddr = Addr;
1944 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1945 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1946 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1947
1948 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1949 Addr = LHS;
1950 ImmOffset = COffsetVal;
1951 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1952 SDLoc SL(N);
1953 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1954 // (large_offset & MaxOffset);
1955 int64_t SplitImmOffset, RemainderOffset;
1956 std::tie(SplitImmOffset, RemainderOffset)
1957 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1958
1959 if (isUInt<32>(RemainderOffset)) {
1960 SDNode *VMov = CurDAG->getMachineNode(
1961 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1962 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1963 VAddr = SDValue(VMov, 0);
1964 SAddr = LHS;
1965 if (!isFlatScratchBaseLegal(Addr))
1966 return false;
1967 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1968 return false;
1969 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1970 return true;
1971 }
1972 }
1973 }
1974
1975 if (Addr.getOpcode() != ISD::ADD)
1976 return false;
1977
1978 LHS = Addr.getOperand(0);
1979 RHS = Addr.getOperand(1);
1980
1981 if (!LHS->isDivergent() && RHS->isDivergent()) {
1982 SAddr = LHS;
1983 VAddr = RHS;
1984 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1985 SAddr = RHS;
1986 VAddr = LHS;
1987 } else {
1988 return false;
1989 }
1990
1991 if (OrigAddr != Addr) {
1992 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1993 return false;
1994 } else {
1995 if (!isFlatScratchBaseLegalSV(OrigAddr))
1996 return false;
1997 }
1998
1999 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
2000 return false;
2001 SAddr = SelectSAddrFI(CurDAG, SAddr);
2002 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2003 return true;
2004}
2005
2006// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2007 // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2008// Handle the case where the Immediate Offset + SOffset is negative.
2009bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2010 bool Imm32Only,
2011 bool IsBuffer,
2012 int64_t ImmOffset) const {
2013 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2014 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2015 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2016 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2017 return false;
2018 }
2019
2020 return true;
2021}
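// Illustrative values (hypothetical): ImmOffset = -16 with an SOffset whose
// known minimum is 8 gives -16 + 8 < 0 and is rejected; a known minimum of
// 32 or more would be accepted.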
2022
2023// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2024// not null) offset. If Imm32Only is true, match only 32-bit immediate
2025// offsets available on CI.
2026bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2027 SDValue *SOffset, SDValue *Offset,
2028 bool Imm32Only, bool IsBuffer,
2029 bool HasSOffset,
2030 int64_t ImmOffset) const {
2031 assert((!SOffset || !Offset) &&
2032 "Cannot match both soffset and offset at the same time!");
2033
2034 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2035 if (!C) {
2036 if (!SOffset)
2037 return false;
2038
2039 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2040 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2041 *SOffset = ByteOffsetNode;
2042 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2043 ImmOffset);
2044 }
2045 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2046 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2047 *SOffset = ByteOffsetNode.getOperand(0);
2048 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2049 ImmOffset);
2050 }
2051 }
2052 return false;
2053 }
2054
2055 SDLoc SL(ByteOffsetNode);
2056
2057 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2058 // offset for S_BUFFER instructions is unsigned.
2059 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2060 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2061 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2062 if (EncodedOffset && Offset && !Imm32Only) {
2063 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2064 return true;
2065 }
2066
2067 // SGPR and literal offsets are unsigned.
2068 if (ByteOffset < 0)
2069 return false;
2070
2071 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2072 if (EncodedOffset && Offset && Imm32Only) {
2073 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2074 return true;
2075 }
2076
2077 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2078 return false;
2079
2080 if (SOffset) {
2081 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2082 *SOffset = SDValue(
2083 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2084 return true;
2085 }
2086
2087 return false;
2088}
2089
2090SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2091 if (Addr.getValueType() != MVT::i32)
2092 return Addr;
2093
2094 // Zero-extend a 32-bit address.
2095 SDLoc SL(Addr);
2096
2097  const SIMachineFunctionInfo *Info =
2098      CurDAG->getMachineFunction().getInfo<SIMachineFunctionInfo>();
2099 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2100 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2101
2102 const SDValue Ops[] = {
2103 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2104 Addr,
2105 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2106 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2107 0),
2108 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2109 };
2110
2111 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2112 Ops), 0);
2113}
2114
2115// Match a base and an immediate (if Offset is not null) or an SGPR (if
2116// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2117// true, match only 32-bit immediate offsets available on CI.
2118bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2119 SDValue *SOffset, SDValue *Offset,
2120 bool Imm32Only, bool IsBuffer,
2121 bool HasSOffset,
2122 int64_t ImmOffset) const {
2123 if (SOffset && Offset) {
2124 assert(!Imm32Only && !IsBuffer);
2125 SDValue B;
2126
2127 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2128 return false;
2129
2130 int64_t ImmOff = 0;
2131 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2132 ImmOff = C->getSExtValue();
2133
2134 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2135 ImmOff);
2136 }
2137
2138 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2139 // wraparound, because s_load instructions perform the addition in 64 bits.
2140 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2141 !Addr->getFlags().hasNoUnsignedWrap())
2142 return false;
2143
2144 SDValue N0, N1;
2145 // Extract the base and offset if possible.
2146 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2147 N0 = Addr.getOperand(0);
2148 N1 = Addr.getOperand(1);
2149 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2150 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2151 }
2152 if (!N0 || !N1)
2153 return false;
2154
2155 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2156 ImmOffset)) {
2157 SBase = N0;
2158 return true;
2159 }
2160 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2161 ImmOffset)) {
2162 SBase = N1;
2163 return true;
2164 }
2165 return false;
2166}
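// Illustrative match (hypothetical operands): for (sgpr_base + sgpr_off) +
// 0x20, the SOffset && Offset case above first matches the immediate 0x20
// against the outer add, then recurses on the remaining sum to split out
// SBase = sgpr_base and SOffset = sgpr_off.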
2167
2168bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2169 SDValue *SOffset, SDValue *Offset,
2170 bool Imm32Only) const {
2171 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2172 SBase = Expand32BitAddress(SBase);
2173 return true;
2174 }
2175
2176 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2177 SBase = Expand32BitAddress(Addr);
2178 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2179 return true;
2180 }
2181
2182 return false;
2183}
2184
2185bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2186 SDValue &Offset) const {
2187 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2188}
2189
2190bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2191                                           SDValue &Offset) const {
2192  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2193 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2194 /* Imm32Only */ true);
2195}
2196
2197bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2198 SDValue &SOffset) const {
2199 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2200}
2201
2202bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2203 SDValue &SOffset,
2204 SDValue &Offset) const {
2205 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2206}
2207
2208bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2209 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2210 /* Imm32Only */ false, /* IsBuffer */ true);
2211}
2212
2213bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2214                                               SDValue &Offset) const {
2215  assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2216 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2217 /* Imm32Only */ true, /* IsBuffer */ true);
2218}
2219
2220bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2221 SDValue &Offset) const {
2222 // Match the (soffset + offset) pair as a 32-bit register base and
2223 // an immediate offset.
2224 return N.getValueType() == MVT::i32 &&
2225 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2226 &Offset, /* Imm32Only */ false,
2227 /* IsBuffer */ true);
2228}
2229
2230bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2231 SDValue &Base,
2232 SDValue &Offset) const {
2233 SDLoc DL(Index);
2234
2235 if (CurDAG->isBaseWithConstantOffset(Index)) {
2236 SDValue N0 = Index.getOperand(0);
2237 SDValue N1 = Index.getOperand(1);
2238 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2239
2240 // (add n0, c0)
2241 // Don't peel off the offset (c0) if doing so could possibly lead
2242 // the base (n0) to be negative.
2243 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2244 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2245 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2246 Base = N0;
2247 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2248 return true;
2249 }
2250 }
2251
2252 if (isa<ConstantSDNode>(Index))
2253 return false;
2254
2255 Base = Index;
2256 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2257 return true;
2258}
2259
2260SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2261 SDValue Val, uint32_t Offset,
2262 uint32_t Width) {
2263 if (Val->isDivergent()) {
2264    unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2265    SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2266 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2267
2268 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2269 }
2270 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2271 // Transformation function, pack the offset and width of a BFE into
2272 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2273 // source, bits [5:0] contain the offset and bits [22:16] the width.
2274 uint32_t PackedVal = Offset | (Width << 16);
2275 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2276
2277 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2278}
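// Packed-operand example (follows the encoding described above): Offset = 16
// and Width = 8 pack to 16 | (8 << 16) == 0x00080010 as the second source of
// S_BFE_U32 / S_BFE_I32.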
2279
2280void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2281 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2282 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2283 // Predicate: 0 < b <= c < 32
2284
2285 const SDValue &Shl = N->getOperand(0);
2286 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2287 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2288
2289 if (B && C) {
2290 uint32_t BVal = B->getZExtValue();
2291 uint32_t CVal = C->getZExtValue();
2292
2293 if (0 < BVal && BVal <= CVal && CVal < 32) {
2294 bool Signed = N->getOpcode() == ISD::SRA;
2295 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2296 32 - CVal));
2297 return;
2298 }
2299 }
2300 SelectCode(N);
2301}
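// Illustrative shift pair (hypothetical values): for "(x << 8) srl 24",
// b = 8 and c = 24 satisfy 0 < b <= c < 32, so the node is replaced by
// BFE_U32 x, 16, 8 -- extract 8 bits starting at bit 16, i.e. x[23:16].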
2302
2303void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2304 switch (N->getOpcode()) {
2305 case ISD::AND:
2306 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2307 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2308 // Predicate: isMask(mask)
2309 const SDValue &Srl = N->getOperand(0);
2310 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2311 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2312
2313 if (Shift && Mask) {
2314 uint32_t ShiftVal = Shift->getZExtValue();
2315 uint32_t MaskVal = Mask->getZExtValue();
2316
2317 if (isMask_32(MaskVal)) {
2318 uint32_t WidthVal = llvm::popcount(MaskVal);
2319 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2320 WidthVal));
2321 return;
2322 }
2323 }
2324 }
2325 break;
2326 case ISD::SRL:
2327 if (N->getOperand(0).getOpcode() == ISD::AND) {
2328 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2329 // Predicate: isMask(mask >> b)
2330 const SDValue &And = N->getOperand(0);
2331 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2332 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2333
2334 if (Shift && Mask) {
2335 uint32_t ShiftVal = Shift->getZExtValue();
2336 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2337
2338 if (isMask_32(MaskVal)) {
2339 uint32_t WidthVal = llvm::popcount(MaskVal);
2340 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2341 WidthVal));
2342 return;
2343 }
2344 }
2345 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2346 SelectS_BFEFromShifts(N);
2347 return;
2348 }
2349 break;
2350 case ISD::SRA:
2351 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2352 SelectS_BFEFromShifts(N);
2353 return;
2354 }
2355 break;
2356
2357  case ISD::SIGN_EXTEND_INREG: {
2358 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2359 SDValue Src = N->getOperand(0);
2360 if (Src.getOpcode() != ISD::SRL)
2361 break;
2362
2363 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2364 if (!Amt)
2365 break;
2366
2367 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2368 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2369 Amt->getZExtValue(), Width));
2370 return;
2371 }
2372 }
2373
2374 SelectCode(N);
2375}
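// Illustrative mask case (hypothetical values): "(x srl 3) & 0xff" has
// isMask_32(0xff) with popcount 8, so the AND case above selects
// BFE_U32 x, 3, 8.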
2376
2377bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2378 assert(N->getOpcode() == ISD::BRCOND);
2379 if (!N->hasOneUse())
2380 return false;
2381
2382 SDValue Cond = N->getOperand(1);
2383 if (Cond.getOpcode() == ISD::CopyToReg)
2384 Cond = Cond.getOperand(2);
2385
2386 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2387 return false;
2388
2389 MVT VT = Cond.getOperand(0).getSimpleValueType();
2390 if (VT == MVT::i32)
2391 return true;
2392
2393 if (VT == MVT::i64) {
2394 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2395 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2396 Subtarget->hasScalarCompareEq64();
2397 }
2398
2399 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2400 return true;
2401
2402 return false;
2403}
2404
2405static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2406 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2407 // Special case for amdgcn.ballot:
2408 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2409 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2410 // =>
2411 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2412  // This is possible because divergent ISD::SETCC is selected as V_CMP and
2413  // Cond becomes an i(WaveSize) full mask value.
2414  // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2415  // it here for completeness, so in this case Negate is set true on return.
2416 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2417 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2418 isNullConstant(VCMP.getOperand(1))) {
2419
2420 auto Cond = VCMP.getOperand(0);
2421 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2422 Cond = Cond.getOperand(0);
2423
2424 if (isBoolSGPR(Cond)) {
2425 Negate = VCMP_CC == ISD::SETEQ;
2426 return Cond;
2427 }
2428 }
2429 return SDValue();
2430}
2431
2432void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2433 SDValue Cond = N->getOperand(1);
2434
2435 if (Cond.isUndef()) {
2436 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2437 N->getOperand(2), N->getOperand(0));
2438 return;
2439 }
2440
2441 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2442
2443 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2444 bool AndExec = !UseSCCBr;
2445 bool Negate = false;
2446
2447 if (Cond.getOpcode() == ISD::SETCC &&
2448 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2449 SDValue VCMP = Cond->getOperand(0);
2450 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2451 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2452 isNullConstant(Cond->getOperand(1)) &&
2453 // We may encounter ballot.i64 in wave32 mode on -O0.
2454 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2455 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2456 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2457 // BRCOND i1 %C, %BB
2458 // =>
2459 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2460 // VCC = COPY i(WaveSize) %VCMP
2461 // S_CBRANCH_VCCNZ/VCCZ %BB
2462 Negate = CC == ISD::SETEQ;
2463 bool NegatedBallot = false;
2464 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2465 Cond = BallotCond;
2466 UseSCCBr = !BallotCond->isDivergent();
2467 Negate = Negate ^ NegatedBallot;
2468 } else {
2469 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2470 // selected as V_CMP, but this may change for uniform condition.
2471 Cond = VCMP;
2472 UseSCCBr = false;
2473 }
2474 }
2475    // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
2476    // V_CMPs resulting from a ballot, or the ballot had a uniform condition and
2477    // SCC is used.
2478 AndExec = false;
2479 }
2480
2481 unsigned BrOp =
2482 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2483 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2484 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2485 SDLoc SL(N);
2486
2487 if (AndExec) {
2488 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2489 // analyzed what generates the vcc value, so we do not know whether vcc
2490 // bits for disabled lanes are 0. Thus we need to mask out bits for
2491 // disabled lanes.
2492 //
2493 // For the case that we select S_CBRANCH_SCC1 and it gets
2494 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2495    // SIInstrInfo::moveToVALU, which inserts the S_AND.
2496 //
2497 // We could add an analysis of what generates the vcc value here and omit
2498    // the S_AND when it is unnecessary. But it would be better to add a separate
2499 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2500 // catches both cases.
2501    Cond = SDValue(
2502        CurDAG->getMachineNode(
2503 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2504 MVT::i1,
2505 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2506 : AMDGPU::EXEC,
2507 MVT::i1),
2508 Cond),
2509 0);
2510 }
2511
2512 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2513 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2514 N->getOperand(2), // Basic Block
2515 VCC.getValue(0));
2516}
2517
2518void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2519 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2520 !N->isDivergent()) {
2521 SDValue Src = N->getOperand(0);
2522 if (Src.getValueType() == MVT::f16) {
2523 if (isExtractHiElt(Src, Src)) {
2524 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2525 {Src});
2526 return;
2527 }
2528 }
2529 }
2530
2531 SelectCode(N);
2532}
2533
2534void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2535 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2536 // be copied to an SGPR with readfirstlane.
2537 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2538 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2539
2540 SDValue Chain = N->getOperand(0);
2541 SDValue Ptr = N->getOperand(2);
2542 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2543 MachineMemOperand *MMO = M->getMemOperand();
2544 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2545
2546  SDValue Offset;
2547  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2548 SDValue PtrBase = Ptr.getOperand(0);
2549 SDValue PtrOffset = Ptr.getOperand(1);
2550
2551 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2552 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2553 N = glueCopyToM0(N, PtrBase);
2554 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2555 }
2556 }
2557
2558 if (!Offset) {
2559 N = glueCopyToM0(N, Ptr);
2560 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2561 }
2562
2563 SDValue Ops[] = {
2564 Offset,
2565 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2566 Chain,
2567 N->getOperand(N->getNumOperands() - 1) // New glue
2568 };
2569
2570 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2571 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2572}
2573
2574// We need to handle this here because tablegen doesn't support matching
2575// instructions with multiple outputs.
2576void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2577 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2578 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2579 N->getOperand(5), N->getOperand(0)};
2580
2581 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2582 MachineMemOperand *MMO = M->getMemOperand();
2583 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2584 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2585}
2586
2587static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2588 switch (IntrID) {
2589 case Intrinsic::amdgcn_ds_gws_init:
2590 return AMDGPU::DS_GWS_INIT;
2591 case Intrinsic::amdgcn_ds_gws_barrier:
2592 return AMDGPU::DS_GWS_BARRIER;
2593 case Intrinsic::amdgcn_ds_gws_sema_v:
2594 return AMDGPU::DS_GWS_SEMA_V;
2595 case Intrinsic::amdgcn_ds_gws_sema_br:
2596 return AMDGPU::DS_GWS_SEMA_BR;
2597 case Intrinsic::amdgcn_ds_gws_sema_p:
2598 return AMDGPU::DS_GWS_SEMA_P;
2599 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2600 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2601 default:
2602 llvm_unreachable("not a gws intrinsic");
2603 }
2604}
2605
2606void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2607 if (!Subtarget->hasGWS() ||
2608 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2609 !Subtarget->hasGWSSemaReleaseAll())) {
2610 // Let this error.
2611 SelectCode(N);
2612 return;
2613 }
2614
2615 // Chain, intrinsic ID, vsrc, offset
2616 const bool HasVSrc = N->getNumOperands() == 4;
2617 assert(HasVSrc || N->getNumOperands() == 3);
2618
2619 SDLoc SL(N);
2620 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2621 int ImmOffset = 0;
2622 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2623 MachineMemOperand *MMO = M->getMemOperand();
2624
2625  // Don't worry if the offset ends up in a VGPR. Only one lane's value takes
2626  // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2627
2628 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2629 // offset field) % 64. Some versions of the programming guide omit the m0
2630 // part, or claim it's from offset 0.
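  // Illustrative (hypothetical values): with M0[21:16] = 2 and an offset
  // field of 3, the resource id would be (<isa opaque base> + 2 + 3) % 64.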
2631 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2632 // If we have a constant offset, try to use the 0 in m0 as the base.
2633 // TODO: Look into changing the default m0 initialization value. If the
2634 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2635 // the immediate offset.
2636 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2637 ImmOffset = ConstOffset->getZExtValue();
2638 } else {
2639 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2640 ImmOffset = BaseOffset.getConstantOperandVal(1);
2641 BaseOffset = BaseOffset.getOperand(0);
2642 }
2643
2644 // Prefer to do the shift in an SGPR since it should be possible to use m0
2645 // as the result directly. If it's already an SGPR, it will be eliminated
2646 // later.
2647 SDNode *SGPROffset
2648 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2649 BaseOffset);
2650 // Shift to offset in m0
2651 SDNode *M0Base
2652 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2653 SDValue(SGPROffset, 0),
2654 CurDAG->getTargetConstant(16, SL, MVT::i32));
2655 glueCopyToM0(N, SDValue(M0Base, 0));
2656 }
2657
2658 SDValue Chain = N->getOperand(0);
2659 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2660
2661  const unsigned Opc = gwsIntrinToOpcode(IntrID);
2662  SmallVector<SDValue, 5> Ops;
2663 if (HasVSrc)
2664 Ops.push_back(N->getOperand(2));
2665 Ops.push_back(OffsetField);
2666 Ops.push_back(Chain);
2667
2668 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2669 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2670}
2671
2672void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2673 if (Subtarget->getLDSBankCount() != 16) {
2674 // This is a single instruction with a pattern.
2675 SelectCode(N);
2676 return;
2677 }
2678
2679 SDLoc DL(N);
2680
2681 // This requires 2 instructions. It is possible to write a pattern to support
2682 // this, but the generated isel emitter doesn't correctly deal with multiple
2683 // output instructions using the same physical register input. The copy to m0
2684 // is incorrectly placed before the second instruction.
2685 //
2686 // TODO: Match source modifiers.
2687 //
2688 // def : Pat <
2689 // (int_amdgcn_interp_p1_f16
2690 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2691 // (i32 timm:$attrchan), (i32 timm:$attr),
2692 // (i1 timm:$high), M0),
2693 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2694 // timm:$attrchan, 0,
2695 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2696 // let Predicates = [has16BankLDS];
2697 // }
2698
2699 // 16 bank LDS
2700 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2701 N->getOperand(5), SDValue());
2702
2703 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2704
2705 SDNode *InterpMov =
2706 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2707 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2708 N->getOperand(3), // Attr
2709 N->getOperand(2), // Attrchan
2710 ToM0.getValue(1) // In glue
2711 });
2712
2713 SDNode *InterpP1LV =
2714 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2715 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2716 N->getOperand(1), // Src0
2717 N->getOperand(3), // Attr
2718 N->getOperand(2), // Attrchan
2719 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2720 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2721 N->getOperand(4), // high
2722 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2723 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2724 SDValue(InterpMov, 1)
2725 });
2726
2727 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2728}
2729
2730void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2731 unsigned IntrID = N->getConstantOperandVal(1);
2732 switch (IntrID) {
2733 case Intrinsic::amdgcn_ds_append:
2734 case Intrinsic::amdgcn_ds_consume: {
2735 if (N->getValueType(0) != MVT::i32)
2736 break;
2737 SelectDSAppendConsume(N, IntrID);
2738 return;
2739 }
2740 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2741 SelectDSBvhStackIntrinsic(N);
2742 return;
2743  case Intrinsic::amdgcn_init_whole_wave:
2744    CurDAG->getMachineFunction()
2745        .getInfo<SIMachineFunctionInfo>()
2746 ->setInitWholeWave();
2747 break;
2748 }
2749
2750 SelectCode(N);
2751}
2752
2753void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2754 unsigned IntrID = N->getConstantOperandVal(0);
2755 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2756 SDNode *ConvGlueNode = N->getGluedNode();
2757 if (ConvGlueNode) {
2758 // FIXME: Possibly iterate over multiple glue nodes?
2759 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2760 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2761 ConvGlueNode =
2762 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2763 MVT::Glue, SDValue(ConvGlueNode, 0));
2764 } else {
2765 ConvGlueNode = nullptr;
2766 }
2767 switch (IntrID) {
2768 case Intrinsic::amdgcn_wqm:
2769 Opcode = AMDGPU::WQM;
2770 break;
2771 case Intrinsic::amdgcn_softwqm:
2772 Opcode = AMDGPU::SOFT_WQM;
2773 break;
2774 case Intrinsic::amdgcn_wwm:
2775 case Intrinsic::amdgcn_strict_wwm:
2776 Opcode = AMDGPU::STRICT_WWM;
2777 break;
2778 case Intrinsic::amdgcn_strict_wqm:
2779 Opcode = AMDGPU::STRICT_WQM;
2780 break;
2781 case Intrinsic::amdgcn_interp_p1_f16:
2782 SelectInterpP1F16(N);
2783 return;
2784 case Intrinsic::amdgcn_permlane16_swap:
2785 case Intrinsic::amdgcn_permlane32_swap: {
2786 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
2787 !Subtarget->hasPermlane16Swap()) ||
2788 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
2789 !Subtarget->hasPermlane32Swap())) {
2790 SelectCode(N); // Hit the default error
2791 return;
2792 }
2793
2794 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
2795 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
2796 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
2797
2798 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
2799 if (ConvGlueNode)
2800 NewOps.push_back(SDValue(ConvGlueNode, 0));
2801
2802 bool FI = N->getConstantOperandVal(3);
2803    NewOps[2] = CurDAG->getTargetConstant(
2804        FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(N), MVT::i32);
2805
2806 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
2807 return;
2808 }
2809 default:
2810 SelectCode(N);
2811 break;
2812 }
2813
2814 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2815 SDValue Src = N->getOperand(1);
2816 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2817 }
2818
2819 if (ConvGlueNode) {
2820 SmallVector<SDValue, 4> NewOps(N->ops());
2821 NewOps.push_back(SDValue(ConvGlueNode, 0));
2822 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2823 }
2824}
2825
2826void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2827 unsigned IntrID = N->getConstantOperandVal(1);
2828 switch (IntrID) {
2829 case Intrinsic::amdgcn_ds_gws_init:
2830 case Intrinsic::amdgcn_ds_gws_barrier:
2831 case Intrinsic::amdgcn_ds_gws_sema_v:
2832 case Intrinsic::amdgcn_ds_gws_sema_br:
2833 case Intrinsic::amdgcn_ds_gws_sema_p:
2834 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2835 SelectDS_GWS(N, IntrID);
2836 return;
2837 default:
2838 break;
2839 }
2840
2841 SelectCode(N);
2842}
2843
2844void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2845 SDValue Log2WaveSize =
2846 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2847 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2848 {N->getOperand(0), Log2WaveSize});
2849}
2850
2851void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2852 SDValue SrcVal = N->getOperand(1);
2853 if (SrcVal.getValueType() != MVT::i32) {
2854 SelectCode(N); // Emit default error
2855 return;
2856 }
2857
2859  SDValue CopyVal;
2860  Register SP = TLI->getStackPointerRegisterToSaveRestore();
2860 SDLoc SL(N);
2861
2862 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2863 CopyVal = SrcVal.getOperand(0);
2864 } else {
2865 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2866 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2867
2868 if (N->isDivergent()) {
2869 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2870 MVT::i32, SrcVal),
2871 0);
2872 }
2873
2874 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2875 {SrcVal, Log2WaveSize}),
2876 0);
2877 }
2878
2879 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2880 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2881}
2882
2883bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2884 unsigned &Mods,
2885 bool IsCanonicalizing,
2886 bool AllowAbs) const {
2887 Mods = SISrcMods::NONE;
2888 Src = In;
2889
2890 if (Src.getOpcode() == ISD::FNEG) {
2891 Mods |= SISrcMods::NEG;
2892 Src = Src.getOperand(0);
2893 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2894 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2895 // denormal mode, but we're implicitly canonicalizing in a source operand.
2896 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2897 if (LHS && LHS->isZero()) {
2898 Mods |= SISrcMods::NEG;
2899 Src = Src.getOperand(1);
2900 }
2901 }
2902
2903 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2904 Mods |= SISrcMods::ABS;
2905 Src = Src.getOperand(0);
2906 }
2907
2908 return true;
2909}
2910
2911bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2912 SDValue &SrcMods) const {
2913 unsigned Mods;
2914 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2915 /*AllowAbs=*/true)) {
2916 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2917 return true;
2918 }
2919
2920 return false;
2921}
2922
2923bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2924 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2925 unsigned Mods;
2926 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2927 /*AllowAbs=*/true)) {
2928 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2929 return true;
2930 }
2931
2932 return false;
2933}
2934
2935bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2936 SDValue &SrcMods) const {
2937 unsigned Mods;
2938 if (SelectVOP3ModsImpl(In, Src, Mods,
2939 /*IsCanonicalizing=*/true,
2940 /*AllowAbs=*/false)) {
2941 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2942 return true;
2943 }
2944
2945 return false;
2946}
2947
2948bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2949 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2950 return false;
2951
2952 Src = In;
2953 return true;
2954}
2955
2956bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2957 SDValue &SrcMods,
2958 bool OpSel) const {
2959 unsigned Mods;
2960 if (SelectVOP3ModsImpl(In, Src, Mods,
2961 /*IsCanonicalizing=*/true,
2962 /*AllowAbs=*/false)) {
2963 if (OpSel)
2964 Mods |= SISrcMods::OP_SEL_0;
2965 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2966 return true;
2967 }
2968
2969 return false;
2970}
2971
2972bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2973 SDValue &SrcMods) const {
2974 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2975}
2976
2977bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2978 SDValue &SrcMods) const {
2979 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2980}
2981
2982bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2983 SDValue &SrcMods, SDValue &Clamp,
2984 SDValue &Omod) const {
2985 SDLoc DL(In);
2986 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2987 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2988
2989 return SelectVOP3Mods(In, Src, SrcMods);
2990}
2991
2992bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2993 SDValue &SrcMods, SDValue &Clamp,
2994 SDValue &Omod) const {
2995 SDLoc DL(In);
2996 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2997 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2998
2999 return SelectVOP3BMods(In, Src, SrcMods);
3000}
3001
3002bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3003 SDValue &Clamp, SDValue &Omod) const {
3004 Src = In;
3005
3006 SDLoc DL(In);
3007 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3008 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3009
3010 return true;
3011}
3012
3013bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3014 SDValue &SrcMods, bool IsDOT) const {
3015 unsigned Mods = SISrcMods::NONE;
3016 Src = In;
3017
3018 // TODO: Handle G_FSUB 0 as fneg
3019  if (Src.getOpcode() == ISD::FNEG) {
3020    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3021 Src = Src.getOperand(0);
3022 }
3023
3024 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3025 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3026 unsigned VecMods = Mods;
3027
3028 SDValue Lo = stripBitcast(Src.getOperand(0));
3029 SDValue Hi = stripBitcast(Src.getOperand(1));
3030
3031 if (Lo.getOpcode() == ISD::FNEG) {
3032 Lo = stripBitcast(Lo.getOperand(0));
3033 Mods ^= SISrcMods::NEG;
3034 }
3035
3036 if (Hi.getOpcode() == ISD::FNEG) {
3037 Hi = stripBitcast(Hi.getOperand(0));
3038 Mods ^= SISrcMods::NEG_HI;
3039 }
3040
3041 if (isExtractHiElt(Lo, Lo))
3042 Mods |= SISrcMods::OP_SEL_0;
3043
3044 if (isExtractHiElt(Hi, Hi))
3045 Mods |= SISrcMods::OP_SEL_1;
3046
3047 unsigned VecSize = Src.getValueSizeInBits();
3048 Lo = stripExtractLoElt(Lo);
3049 Hi = stripExtractLoElt(Hi);
3050
3051    if (Lo.getValueSizeInBits() > VecSize) {
3052      Lo = CurDAG->getTargetExtractSubreg(
3053 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3054 MVT::getIntegerVT(VecSize), Lo);
3055 }
3056
3057    if (Hi.getValueSizeInBits() > VecSize) {
3058      Hi = CurDAG->getTargetExtractSubreg(
3059 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3060 MVT::getIntegerVT(VecSize), Hi);
3061 }
3062
3063 assert(Lo.getValueSizeInBits() <= VecSize &&
3064 Hi.getValueSizeInBits() <= VecSize);
3065
3066 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3067 // Really a scalar input. Just select from the low half of the register to
3068 // avoid packing.
3069
3070 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3071 Src = Lo;
3072 } else {
3073 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3074
3075        SDLoc SL(In);
3076        SDValue Undef = SDValue(
3077 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3078 Lo.getValueType()), 0);
3079 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3080 : AMDGPU::SReg_64RegClassID;
3081 const SDValue Ops[] = {
3082 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3083 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3084 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3085
3086 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3087 Src.getValueType(), Ops), 0);
3088 }
3089 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3090 return true;
3091 }
3092
3093 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3094 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3095 .bitcastToAPInt().getZExtValue();
3096 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3097 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3098 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3099 return true;
3100 }
3101 }
3102
3103 Mods = VecMods;
3104 }
3105
3106 // Packed instructions do not have abs modifiers.
3107 Mods |= SISrcMods::OP_SEL_1;
3108
3109 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3110 return true;
3111}
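// Illustrative packed-modifier fold (hypothetical DAG): for
// (build_vector (fneg $a), (fneg $b)), the code above strips both fnegs and
// selects ($a, $b) with NEG and NEG_HI set (on top of the default OP_SEL_1).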
3112
3113bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3114 SDValue &SrcMods) const {
3115 return SelectVOP3PMods(In, Src, SrcMods, true);
3116}
3117
3118bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3119 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3120 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3121 // 1 promotes packed values to signed, 0 treats them as unsigned.
3122 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3123
3124 unsigned Mods = SISrcMods::OP_SEL_1;
3125 unsigned SrcSign = C->getZExtValue();
3126 if (SrcSign == 1)
3127 Mods ^= SISrcMods::NEG;
3128
3129 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3130 return true;
3131}
3132
3133bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3134 SDValue &Src) const {
3135 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3136 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3137
3138 unsigned Mods = SISrcMods::OP_SEL_1;
3139 unsigned SrcVal = C->getZExtValue();
3140 if (SrcVal == 1)
3141 Mods |= SISrcMods::OP_SEL_0;
3142
3143 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3144 return true;
3145}
3146
3147 static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
3148 llvm::SelectionDAG *CurDAG,
3149 const SDLoc &DL) {
3150 unsigned DstRegClass;
3151 EVT DstTy;
3152 switch (Elts.size()) {
3153 case 8:
3154 DstRegClass = AMDGPU::VReg_256RegClassID;
3155 DstTy = MVT::v8i32;
3156 break;
3157 case 4:
3158 DstRegClass = AMDGPU::VReg_128RegClassID;
3159 DstTy = MVT::v4i32;
3160 break;
3161 case 2:
3162 DstRegClass = AMDGPU::VReg_64RegClassID;
3163 DstTy = MVT::v2i32;
3164 break;
3165 default:
3166 llvm_unreachable("unhandled Reg sequence size");
3167 }
3168
3169  SmallVector<SDValue, 17> Ops;
3170 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3171 for (unsigned i = 0; i < Elts.size(); ++i) {
3172 Ops.push_back(Elts[i]);
3173    Ops.push_back(CurDAG->getTargetConstant(
3174        SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
3175 }
3176 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3177}
3178
3179 static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
3180 llvm::SelectionDAG *CurDAG,
3181 const SDLoc &DL) {
3182 SmallVector<SDValue, 8> PackedElts;
3183 assert("unhandled Reg sequence size" &&
3184 (Elts.size() == 8 || Elts.size() == 16));
3185
3186  // Pack 16-bit elements in pairs into a 32-bit register. If both elements
3187  // come unpacked from the same 32-bit source, use it; otherwise pack with v_perm.
3188 for (unsigned i = 0; i < Elts.size(); i += 2) {
3189 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3190 SDValue HiSrc;
3191 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3192 PackedElts.push_back(HiSrc);
3193 } else {
3194 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3195 MachineSDNode *Packed =
3196 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3197 {Elts[i + 1], Elts[i], PackLoLo});
3198 PackedElts.push_back(SDValue(Packed, 0));
3199 }
3200 }
3201
3202 return buildRegSequence32(PackedElts, CurDAG, DL);
3203}
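// Note on the 0x05040100 selector (assuming V_PERM_B32's usual byte
// numbering, where selector bytes 0-3 index src1 and 4-7 index src0): the
// result takes bytes 1:0 of Elts[i] as its low half and bytes 1:0 of
// Elts[i + 1] as its high half, i.e. the two low 16-bit lanes packed into
// one dword.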
3204
3205 static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
3206 llvm::SelectionDAG *CurDAG,
3207 const SDLoc &DL, unsigned ElementSize) {
3208 if (ElementSize == 16)
3209 return buildRegSequence16(Elts, CurDAG, DL);
3210 if (ElementSize == 32)
3211 return buildRegSequence32(Elts, CurDAG, DL);
3212 llvm_unreachable("Unhandled element size");
3213}
3214
3215 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3216                                  SmallVectorImpl<SDValue> &Elts, SDValue &Src,
3217 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3218 unsigned ElementSize) {
3219 if (ModOpcode == ISD::FNEG) {
3220 Mods |= SISrcMods::NEG;
3221 // Check if all elements also have abs modifier
3222 SmallVector<SDValue, 8> NegAbsElts;
3223 for (auto El : Elts) {
3224 if (El.getOpcode() != ISD::FABS)
3225 break;
3226 NegAbsElts.push_back(El->getOperand(0));
3227 }
3228 if (Elts.size() != NegAbsElts.size()) {
3229 // Neg
3230 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3231 } else {
3232 // Neg and Abs
3233 Mods |= SISrcMods::NEG_HI;
3234 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3235 }
3236 } else {
3237 assert(ModOpcode == ISD::FABS);
3238 // Abs
3239 Mods |= SISrcMods::NEG_HI;
3240 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3241 }
3242}
3243
3244 // Check all f16 elements for modifiers while looking through b32 and v2b16
3245 // build vectors; stop once an element does not satisfy ModifierCheck.
3246 static void
3247 checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
3248 std::function<bool(SDValue)> ModifierCheck) {
3249 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3250 if (auto *F16Pair =
3251 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3252 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3253 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3254 if (!ModifierCheck(ElF16))
3255 break;
3256 }
3257 }
3258 }
3259}
3260
3261bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3262 SDValue &SrcMods) const {
3263 Src = In;
3264 unsigned Mods = SISrcMods::OP_SEL_1;
3265
3266 // mods are on f16 elements
3267  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3268    SmallVector<SDValue, 8> EltsF16;
3269
3270 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3271 if (Element.getOpcode() != ISD::FNEG)
3272 return false;
3273 EltsF16.push_back(Element.getOperand(0));
3274 return true;
3275 });
3276
3277 // All elements have neg modifier
3278 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3279 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3280 Mods |= SISrcMods::NEG;
3281 Mods |= SISrcMods::NEG_HI;
3282 }
3283 }
3284
3285 // mods are on v2f16 elements
3286 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3287 SmallVector<SDValue, 8> EltsV2F16;
3288 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3289 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3290 // Based on first element decide which mod we match, neg or abs
3291 if (ElV2f16.getOpcode() != ISD::FNEG)
3292 break;
3293 EltsV2F16.push_back(ElV2f16.getOperand(0));
3294 }
3295
3296 // All pairs of elements have neg modifier
3297 if (BV->getNumOperands() == EltsV2F16.size()) {
3298 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3299 Mods |= SISrcMods::NEG;
3300 Mods |= SISrcMods::NEG_HI;
3301 }
3302 }
3303
3304 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3305 return true;
3306}
3307
3308bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3309 SDValue &SrcMods) const {
3310 Src = In;
3311 unsigned Mods = SISrcMods::OP_SEL_1;
3312 unsigned ModOpcode;
3313
3314 // mods are on f16 elements
3315  if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3316    SmallVector<SDValue, 8> EltsF16;
3317 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3318 // Based on first element decide which mod we match, neg or abs
3319 if (EltsF16.empty())
3320 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3321 if (ElF16.getOpcode() != ModOpcode)
3322 return false;
3323 EltsF16.push_back(ElF16.getOperand(0));
3324 return true;
3325 });
3326
3327 // All elements have ModOpcode modifier
3328 if (BV->getNumOperands() * 2 == EltsF16.size())
3329 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3330 16);
3331 }
3332
3333 // mods are on v2f16 elements
3334 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3335 SmallVector<SDValue, 8> EltsV2F16;
3336
3337 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3338 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3339 // Based on first element decide which mod we match, neg or abs
3340 if (EltsV2F16.empty())
3341 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3342 if (ElV2f16->getOpcode() != ModOpcode)
3343 break;
3344 EltsV2F16.push_back(ElV2f16->getOperand(0));
3345 }
3346
3347 // All elements have ModOpcode modifier
3348 if (BV->getNumOperands() == EltsV2F16.size())
3349 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3350 32);
3351 }
3352
3353 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3354 return true;
3355}
3356
3357bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3358 SDValue &SrcMods) const {
3359 Src = In;
3360  unsigned Mods = SISrcMods::OP_SEL_1;
3361  SmallVector<SDValue, 8> EltsF32;
3362
3363 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3364 assert(BV->getNumOperands() > 0);
3365 // Based on first element decide which mod we match, neg or abs
3366 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3367 unsigned ModOpcode =
3368 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3369 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3370 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3371 if (ElF32.getOpcode() != ModOpcode)
3372 break;
3373 EltsF32.push_back(ElF32.getOperand(0));
3374 }
3375
3376 // All elements had ModOpcode modifier
3377 if (BV->getNumOperands() == EltsF32.size())
3378 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3379 32);
3380 }
3381
3382 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3383 return true;
3384}
3385
3386bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3387 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3388 BitVector UndefElements;
3389 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3390 if (isInlineImmediate(Splat.getNode())) {
3391 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3392 unsigned Imm = C->getAPIntValue().getSExtValue();
3393 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3394 return true;
3395 }
3396 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3397 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3398 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3399 return true;
3400 }
3401 llvm_unreachable("unhandled Constant node");
3402 }
3403 }
3404
3405 // 16 bit splat
3406 SDValue SplatSrc32 = stripBitcast(In);
3407 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3408 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3409 SDValue SplatSrc16 = stripBitcast(Splat32);
3410 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3411 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3412 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3413 std::optional<APInt> RawValue;
3414 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3415 RawValue = C->getValueAPF().bitcastToAPInt();
3416 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3417 RawValue = C->getAPIntValue();
3418
3419 if (RawValue.has_value()) {
3420 EVT VT = In.getValueType().getScalarType();
3421 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3422            APFloat FloatVal(VT.getSimpleVT() == MVT::f16
3423                                 ? APFloat::IEEEhalf()
3424                                 : APFloat::BFloat(),
3425 RawValue.value());
3426 if (TII->isInlineConstant(FloatVal)) {
3427 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3428 MVT::i16);
3429 return true;
3430 }
3431 } else if (VT.getSimpleVT() == MVT::i16) {
3432 if (TII->isInlineConstant(RawValue.value())) {
3433 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3434 MVT::i16);
3435 return true;
3436 }
3437 } else
3438 llvm_unreachable("unknown 16-bit type");
3439 }
3440 }
3441 }
3442
3443 return false;
3444}
3445
3446bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3447 SDValue &IndexKey) const {
3448 unsigned Key = 0;
3449 Src = In;
3450
3451 if (In.getOpcode() == ISD::SRL) {
3452 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3453 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3454 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3455 ShiftAmt->getZExtValue() % 8 == 0) {
3456 Key = ShiftAmt->getZExtValue() / 8;
3457 Src = ShiftSrc;
3458 }
3459 }
3460
3461 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3462 return true;
3463}
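// Illustrative index keys for the 8-bit case above (hypothetical shifts):
// (srl $idx, 8) yields Key = 1 and (srl $idx, 16) yields Key = 2, selecting
// the matching byte of the index register.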
3464
3465bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3466 SDValue &IndexKey) const {
3467 unsigned Key = 0;
3468 Src = In;
3469
3470 if (In.getOpcode() == ISD::SRL) {
3471 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3472 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3473 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3474 ShiftAmt->getZExtValue() == 16) {
3475 Key = 1;
3476 Src = ShiftSrc;
3477 }
3478 }
3479
3480 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3481 return true;
3482}
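
// --- Illustrative sketch (not part of the upstream file) ---------------------
// The two selectors above map a right shift of a 32-bit source onto an
// index_key operand: for 8-bit indices a shift by 8*k selects byte k, and for
// 16-bit indices only a shift by 16 selects the high half. A standalone model:
#if 0
#include <cstdint>

// Byte-index key: (x >> (8*k)) selects byte k of a 32-bit register.
static unsigned indexKey8(uint64_t ShiftAmt) {
  return (ShiftAmt % 8 == 0 && ShiftAmt < 32) ? unsigned(ShiftAmt / 8) : 0;
}

// Half-word key: only (x >> 16) selects the high 16 bits.
static unsigned indexKey16(uint64_t ShiftAmt) {
  return ShiftAmt == 16 ? 1 : 0;
}

// Example: indexKey8(24) == 3 (top byte); indexKey16(16) == 1 (top half).
#endif
// -----------------------------------------------------------------------------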
3483
3484bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3485 SDValue &SrcMods) const {
3486 Src = In;
3487 // FIXME: Handle op_sel
3488 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3489 return true;
3490}
3491
3492bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3493 SDValue &SrcMods) const {
3494 // FIXME: Handle op_sel
3495 return SelectVOP3Mods(In, Src, SrcMods);
3496}
3497
3498// The return value is not whether the match is possible (which it always is),
3499// but whether a conversion is really used.
3500bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3501 unsigned &Mods) const {
3502 Mods = 0;
3503 SelectVOP3ModsImpl(In, Src, Mods);
3504
3505 if (Src.getOpcode() == ISD::FP_EXTEND) {
3506 Src = Src.getOperand(0);
3507 assert(Src.getValueType() == MVT::f16);
3508 Src = stripBitcast(Src);
3509
3510 // Be careful about folding modifiers if we already have an abs. fneg is
3511 // applied last, so we don't want to apply an earlier fneg.
3512 if ((Mods & SISrcMods::ABS) == 0) {
3513 unsigned ModsTmp;
3514 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3515
3516 if ((ModsTmp & SISrcMods::NEG) != 0)
3517 Mods ^= SISrcMods::NEG;
3518
3519 if ((ModsTmp & SISrcMods::ABS) != 0)
3520 Mods |= SISrcMods::ABS;
3521 }
3522
3523 // op_sel/op_sel_hi decide the source type and source.
3524 // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
3525 // If the source's op_sel is set, it picks the high half of the source
3526 // register.
3527
3528 Mods |= SISrcMods::OP_SEL_1;
3529 if (isExtractHiElt(Src, Src)) {
3530 Mods |= SISrcMods::OP_SEL_0;
3531
3532 // TODO: Should we try to look for neg/abs here?
3533 }
3534
3535 return true;
3536 }
3537
3538 return false;
3539}
3540
3541bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3542 SDValue &SrcMods) const {
3543 unsigned Mods = 0;
3544 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3545 return false;
3546 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3547 return true;
3548}
3549
3550bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3551 SDValue &SrcMods) const {
3552 unsigned Mods = 0;
3553 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3554 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3555 return true;
3556}
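
// --- Illustrative sketch (not part of the upstream file) ---------------------
// SelectVOP3PMadMixModsImpl composes source modifiers as bit flags: an inner
// fneg toggles the NEG bit (two fnegs cancel), while abs is sticky and, once
// present on the outer value, blocks folding an fneg that was applied before
// it. The flag values here are placeholders, not the real SISrcMods encoding.
#if 0
#include <cstdint>

enum : uint32_t { NEG = 1u << 0, ABS = 1u << 1 };

// Fold an inner operation's modifiers into an existing modifier mask.
static uint32_t foldInnerMods(uint32_t Outer, uint32_t Inner) {
  if (Outer & ABS)
    return Outer; // abs is applied after the inner fneg; don't fold it.
  if (Inner & NEG)
    Outer ^= NEG; // fneg(fneg(x)) == x, hence XOR rather than OR.
  if (Inner & ABS)
    Outer |= ABS;
  return Outer;
}

// Example: folding an inner fneg into an outer fneg clears NEG:
//   foldInnerMods(NEG, NEG) == 0.
#endif
// -----------------------------------------------------------------------------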
3557
3558// Match a BITOP3 operation and return the number of matched instructions
3559// plus the truth table.
3560static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3561 SmallVectorImpl<SDValue> &Src) {
3562 unsigned NumOpcodes = 0;
3563 uint8_t LHSBits, RHSBits;
3564
3565 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3566 // Define truth table given Src0, Src1, Src2 bits permutations:
3567 // 0 0 0
3568 // 0 0 1
3569 // 0 1 0
3570 // 0 1 1
3571 // 1 0 0
3572 // 1 0 1
3573 // 1 1 0
3574 // 1 1 1
3575 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3576
3577 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3578 if (C->isAllOnes()) {
3579 Bits = 0xff;
3580 return true;
3581 }
3582 if (C->isZero()) {
3583 Bits = 0;
3584 return true;
3585 }
3586 }
3587
3588 for (unsigned I = 0; I < Src.size(); ++I) {
3589 // Try to find an existing reused operand
3590 if (Src[I] == Op) {
3591 Bits = SrcBits[I];
3592 return true;
3593 }
3594 // Try to replace the parent operator
3595 if (Src[I] == In) {
3596 Bits = SrcBits[I];
3597 Src[I] = Op;
3598 return true;
3599 }
3600 }
3601
3602 if (Src.size() == 3) {
3603 // No room left for operands. Try one last time; there may be a 'not' of
3604 // one of our source operands. In that case we can compute the bits
3605 // without growing the Src vector.
3606 if (Op.getOpcode() == ISD::XOR) {
3607 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3608 if (C->isAllOnes()) {
3609 SDValue LHS = Op.getOperand(0);
3610 for (unsigned I = 0; I < Src.size(); ++I) {
3611 if (Src[I] == LHS) {
3612 Bits = ~SrcBits[I];
3613 return true;
3614 }
3615 }
3616 }
3617 }
3618 }
3619
3620 return false;
3621 }
3622
3623 Bits = SrcBits[Src.size()];
3624 Src.push_back(Op);
3625 return true;
3626 };
3627
3628 switch (In.getOpcode()) {
3629 case ISD::AND:
3630 case ISD::OR:
3631 case ISD::XOR: {
3632 SDValue LHS = In.getOperand(0);
3633 SDValue RHS = In.getOperand(1);
3634
3635 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3636 if (!getOperandBits(LHS, LHSBits) ||
3637 !getOperandBits(RHS, RHSBits)) {
3638 Src = Backup;
3639 return std::make_pair(0, 0);
3640 }
3641
3642 // Recursion is naturally limited by the size of the operand vector.
3643 auto Op = BitOp3_Op(LHS, Src);
3644 if (Op.first) {
3645 NumOpcodes += Op.first;
3646 LHSBits = Op.second;
3647 }
3648
3649 Op = BitOp3_Op(RHS, Src);
3650 if (Op.first) {
3651 NumOpcodes += Op.first;
3652 RHSBits = Op.second;
3653 }
3654 break;
3655 }
3656 default:
3657 return std::make_pair(0, 0);
3658 }
3659
3660 uint8_t TTbl;
3661 switch (In.getOpcode()) {
3662 case ISD::AND:
3663 TTbl = LHSBits & RHSBits;
3664 break;
3665 case ISD::OR:
3666 TTbl = LHSBits | RHSBits;
3667 break;
3668 case ISD::XOR:
3669 TTbl = LHSBits ^ RHSBits;
3670 break;
3671 default:
3672 break;
3673 }
3674
3675 return std::make_pair(NumOpcodes + 1, TTbl);
3676}
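
// --- Illustrative sketch (not part of the upstream file) ---------------------
// BitOp3_Op encodes each source as an 8-bit column of the 3-input truth table
// (src0 = 0xf0, src1 = 0xcc, src2 = 0xaa) and then evaluates the expression on
// those masks, so ordinary byte-wide &, |, ^ compute the table for the whole
// expression.
#if 0
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t A = 0xf0, B = 0xcc, C = 0xaa;

  // (a & b) | c: applying the operators to the columns yields the table.
  assert(uint8_t((A & B) | C) == 0xea);

  // ~a is just the complemented column, matching the XOR-with-all-ones case.
  assert(uint8_t((uint8_t(~A) & B) ^ C) == 0xa6);
  return 0;
}
#endif
// -----------------------------------------------------------------------------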
3677
3678bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
3679 SDValue &Src2, SDValue &Tbl) const {
3680 SmallVector<SDValue, 3> Src;
3681 uint8_t TTbl;
3682 unsigned NumOpcodes;
3683
3684 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
3685
3686 // The Src.empty() case can happen if the operands are all zeros or all ones.
3687 // Normally it would be optimized out before reaching this point.
3688 if (NumOpcodes < 2 || Src.empty())
3689 return false;
3690
3691 // For the uniform case the threshold should be higher to account for moves
3692 // between VGPRs and SGPRs. It needs one operand in a VGPR; the other two can
3693 // be in SGPRs, with a readfirstlane after.
3694 if (NumOpcodes < 4 && !In->isDivergent())
3695 return false;
3696
3697 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
3698 // Avoid using BITOP3 for OR3, XOR3, and AND_OR. This is not faster, but it
3699 // makes the asm more readable. This cannot be modeled with AddedComplexity
3700 // because the selector does not know how many operations we matched.
3701 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
3702 (In.getOperand(0).getOpcode() == In.getOpcode() ||
3703 In.getOperand(1).getOpcode() == In.getOpcode()))
3704 return false;
3705
3706 if (In.getOpcode() == ISD::OR &&
3707 (In.getOperand(0).getOpcode() == ISD::AND ||
3708 In.getOperand(1).getOpcode() == ISD::AND))
3709 return false;
3710 }
3711
3712 // The last operand can be ignored, turning a ternary operation into a
3713 // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
3714 // replace 'c' with 'a' here without changing the answer. In some
3715 // pathological cases it should even be possible to get an operation with a
3716 // single operand, if the optimizer does not catch it first.
3717 while (Src.size() < 3)
3718 Src.push_back(Src[0]);
3719
3720 Src0 = Src[0];
3721 Src1 = Src[1];
3722 Src2 = Src[2];
3723
3724 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
3725 return true;
3726}
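
// --- Illustrative sketch (not part of the upstream file) ---------------------
// Why duplicating Src[0] is safe when fewer than three distinct operands were
// matched: the truth table only depends on the columns that were actually
// assigned, so the remaining source is a don't-care.
#if 0
#include <cassert>
#include <cstdint>

// Evaluate a 3-input truth table for concrete input bits a, b, c.
static unsigned evalTTbl(uint8_t TTbl, unsigned a, unsigned b, unsigned c) {
  return (TTbl >> ((a << 2) | (b << 1) | c)) & 1;
}

int main() {
  const uint8_t XorAB = 0xf0 ^ 0xcc; // table for a ^ b; c was never assigned
  for (unsigned a = 0; a < 2; ++a)
    for (unsigned b = 0; b < 2; ++b)
      for (unsigned c = 0; c < 2; ++c)
        assert(evalTTbl(XorAB, a, b, c) == (a ^ b)); // c never matters
  return 0;
}
#endif
// -----------------------------------------------------------------------------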
3727
3728SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3729 if (In.isUndef())
3730 return CurDAG->getUNDEF(MVT::i32);
3731
3732 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3733 SDLoc SL(In);
3734 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3735 }
3736
3737 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3738 SDLoc SL(In);
3739 return CurDAG->getConstant(
3740 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3741 }
3742
3743 SDValue Src;
3744 if (isExtractHiElt(In, Src))
3745 return Src;
3746
3747 return SDValue();
3748}
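
// --- Illustrative sketch (not part of the upstream file) ---------------------
// getHi16Elt places a 16-bit constant into the high half of a 32-bit word,
// e.g. the f16 value 1.0 (bit pattern 0x3C00) becomes 0x3C000000:
#if 0
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Hi16 = uint32_t(0x3C00) << 16; // f16 1.0 moved to the high half
  assert(Hi16 == 0x3C000000u);
  return 0;
}
#endif
// -----------------------------------------------------------------------------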
3749
3750bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3751 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3752
3753 const SIRegisterInfo *SIRI =
3754 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3755 const SIInstrInfo * SII =
3756 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3757
3758 unsigned Limit = 0;
3759 bool AllUsesAcceptSReg = true;
3760 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3761 Limit < 10 && U != E; ++U, ++Limit) {
3762 const TargetRegisterClass *RC =
3763 getOperandRegClass(U->getUser(), U->getOperandNo());
3764
3765 // If the register class is unknown, it could be a register class
3766 // that needs to be an SGPR, e.g. due to an inline asm
3767 // constraint.
3768 if (!RC || SIRI->isSGPRClass(RC))
3769 return false;
3770
3771 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3772 AllUsesAcceptSReg = false;
3773 SDNode *User = U->getUser();
3774 if (User->isMachineOpcode()) {
3775 unsigned Opc = User->getMachineOpcode();
3776 const MCInstrDesc &Desc = SII->get(Opc);
3777 if (Desc.isCommutable()) {
3778 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
3779 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3780 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3781 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3782 const TargetRegisterClass *CommutedRC =
3783 getOperandRegClass(U->getUser(), CommutedOpNo);
3784 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3785 CommutedRC == &AMDGPU::VS_64RegClass)
3786 AllUsesAcceptSReg = true;
3787 }
3788 }
3789 }
3790 // If AllUsesAcceptSReg is still false, we have not succeeded in
3791 // commuting the current user. This means there is at least one use
3792 // that strictly requires a VGPR, so we will not attempt to commute
3793 // other user instructions.
3794 if (!AllUsesAcceptSReg)
3795 break;
3796 }
3797 }
3798 return !AllUsesAcceptSReg && (Limit < 10);
3799}
3800
3801bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3802 const auto *Ld = cast<LoadSDNode>(N);
3803
3804 const MachineMemOperand *MMO = Ld->getMemOperand();
3805 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3806 return false;
3807
3808 return MMO->getSize().hasValue() &&
3809 Ld->getAlign() >=
3810 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3811 uint64_t(4))) &&
3812 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3813 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3814 (Subtarget->getScalarizeGlobalBehavior() &&
3815 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3816 Ld->isSimple() &&
3817 static_cast<const SITargetLowering *>(getTargetLowering())
3818 ->isMemOpHasNoClobberedMemOperand(N)));
3819}
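
// --- Illustrative sketch (not part of the upstream file) ---------------------
// The alignment test above requires natural alignment only up to 4 bytes: a
// load qualifies if it is aligned to min(size-in-bytes, 4). A standalone model
// of that predicate:
#if 0
#include <algorithm>
#include <cassert>
#include <cstdint>

static bool hasScalarLoadAlign(uint64_t SizeInBytes, uint64_t AlignInBytes) {
  return AlignInBytes >= std::min<uint64_t>(SizeInBytes, 4);
}

int main() {
  assert(hasScalarLoadAlign(2, 2));  // sub-dword loads need their own size
  assert(hasScalarLoadAlign(16, 4)); // wide loads still only need 4 bytes
  assert(!hasScalarLoadAlign(4, 2)); // an under-aligned dword load fails
  return 0;
}
#endif
// -----------------------------------------------------------------------------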
3820
3821void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3822 const AMDGPUTargetLowering& Lowering =
3823 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3824 bool IsModified = false;
3825 do {
3826 IsModified = false;
3827
3828 // Go over all selected nodes and try to fold them a bit more
3829 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3830 while (Position != CurDAG->allnodes_end()) {
3831 SDNode *Node = &*Position++;
3832 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3833 if (!MachineNode)
3834 continue;
3835
3836 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3837 if (ResNode != Node) {
3838 if (ResNode)
3839 ReplaceUses(Node, ResNode);
3840 IsModified = true;
3841 }
3842 }
3843 CurDAG->RemoveDeadNodes();
3844 } while (IsModified);
3845}
3846
3847AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3848 CodeGenOptLevel OptLevel)
3849 : SelectionDAGISelLegacy(
3850 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3851