//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//

namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

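// For example (illustrative), both of these forms match with Out = V:
//   (i16 (extract_vector_elt (v2i16 V), 1))
//   (i16 (trunc (srl (i32 (bitcast (v2i16 V))), 16)))
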
// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysisLegacy)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISelLegacy, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false,
                    false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISelLegacy(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(TM, OptLevel) {}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
  // Fabs is lowered to a bit operation, but it's an and which will clear the
  // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad.
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

bool AMDGPUDAGToDAGISelLegacy::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  return SelectionDAGISel::runOnMachineFunction(MF);
}

void AMDGPUDAGToDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISel::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

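// Worked example (illustrative): with t0 as the existing low element and t1 a
// one-use 16-bit load,
//   t2: v2f16 = build_vector t0, t1
// is rewritten to
//   t3: v2f16,ch = LOAD_D16_HI ptr, (v2f16 scalar_to_vector t0)
// after which users of t2 use t3 and users of t1's chain use t3's chain.
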
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      // TODO: Match load d16 from shl (extload:i16), 16
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
    return TII->isInlineConstant(C->getAPIntValue());

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
    return TII->isInlineConstant(C->getValueAPF());

  return false;
}

/// Determine the register class for \p OpNo.
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo, or nullptr if the register class cannot
/// be determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = N->getConstantOperandVal(0);
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = SubRegOp->getAsZExtVal();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering &Lowering =
      *static_cast<const SITargetLowering *>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(
          N, CurDAG->getSignedTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32));
  SDNode *Hi = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
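
// Worked example (illustrative): for Imm = 0x0000000100000002 this emits
//   %lo:i32 = S_MOV_B32 2        ; Lo_32(Imm)
//   %hi:i32 = S_MOV_B32 1        ; Hi_32(Imm)
//   %r      = REG_SEQUENCE SReg_64RegClassID, %lo, sub0, %hi, sub1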

void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

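// Example (illustrative): a v2i32 build_vector (a, b) with a 64-bit SGPR
// class becomes REG_SEQUENCE SReg_64, a, sub0, b, sub1; a scalar_to_vector
// with missing lanes pads them with a shared IMPLICIT_DEF value.
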
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom-lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, true))
        break;
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
      if (AMDGPU::isValid32BitLiteral(Imm, false))
        break;
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version
    // packs the width and offset into a single operand. Try to move to the
    // scalar version if the offsets are constant, so that we can try to keep
    // extended loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case ISD::FP_EXTEND:
    SelectFP_EXTEND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = N->getConstantOperandAPInt(1);
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}
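
// Example (illustrative): for a 32-bit shift amount, ShAmtBits == 5, so the
// mask in (shl x, (and y, 31)) is unneeded: an AND with 0b11111 keeps every
// bit the hardware reads from the shift-amount operand anyway.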

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, this is a complicated pattern to match,
    // i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that the split base (Lo and Hi) is extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISelLegacy::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

AMDGPUISelDAGToDAGPass::AMDGPUISelDAGToDAGPass(GCNTargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<AMDGPUDAGToDAGISel>(TM, TM.getOptLevel())) {}

PreservedAnalyses
AMDGPUISelDAGToDAGPass::run(MachineFunction &MF,
                            MachineFunctionAnalysisManager &MFAM) {
#ifdef EXPENSIVE_CHECKS
  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  auto &F = MF.getFunction();
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
  for (auto &L : LI.getLoopsInPreorder())
    assert(L->isLCSSAForm(DT) && "Loop is not in LCSSA form!");
#endif
  return SelectionDAGISelPass::run(MF, MFAM);
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo, 0),
    Sub0,
    SDValue(AddHi, 0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use.
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
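
// Worked example (illustrative): a uniform (non-divergent) i64 ADDC selects to
//   %lo = S_ADD_U32  %lhs.sub0, %rhs.sub0   ; produces carry (glue)
//   %hi = S_ADDC_U32 %lhs.sub1, %rhs.sub1   ; consumes the low carry
//   %r  = REG_SEQUENCE SReg_64, %lo, sub0, %hi, sub1
// and the node's carry-out use is rewired to %hi's second result.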

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}
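
// Note (illustrative): in the FMA case above, Ops[0..7] hold the VOP3 fields
// src0_mod, src0, src1_mod, src1, src2_mod, src2, clamp, omod filled in by
// SelectVOP3Mods*, while Ops[8] and Ops[9] pass the node's chain and final
// input through unchanged. The FMAC form is only chosen when every *_mod
// constant is zero, since V_FMAC can later shrink to the VOP2 encoding.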

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64
                                  : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}
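
// Worked example (illustrative): (lo, hi) = umul_lohi a, b becomes
//   %mad:i64 = V_MAD_U64_U32 a, b, 0, clamp=0
//   lo = EXTRACT_SUBREG %mad, sub0
//   hi = EXTRACT_SUBREG %mad, sub1
// with an unused half simply never extracted.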

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}
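
// Example (illustrative): (load (add %v, 4092)) selects with Base = %v and
// Offset = 4092, since DS offsets are unsigned 16-bit; an offset like 65540
// is not folded and falls through to the Base = Addr, Offset = 0 default.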

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

// Return whether the operation has the NoUnsignedWrap property.
static bool isNoUnsignedWrap(SDValue Addr) {
  return (Addr.getOpcode() == ISD::ADD &&
          Addr->getFlags().hasNoUnsignedWrap()) ||
         Addr->getOpcode() == ISD::OR;
}

// Check that the base address of a flat scratch load/store in the form of
// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per
// hardware requirement). We always treat the first operand as the base
// address here.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  ConstantSDNode *ImmOp = nullptr;
  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
    if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
      return true;
  }

  return CurDAG->SignBitIsZero(LHS);
}
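
// Example (illustrative): (add %base, -16) is accepted without proving the
// sign bit of %base is clear, because a small negative immediate added to a
// negative base could never land back inside a thread's valid scratch range.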

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
  if (isNoUnsignedWrap(Addr))
    return true;

  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (Subtarget->hasSignedScratchOffsets())
    return true;

  auto LHS = Addr.getOperand(0);
  auto RHS = Addr.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// Check that the address values in SGPR/VGPR are legal for flat scratch in
// the form of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  // values.
  if (AMDGPU::isGFX12Plus(*Subtarget))
    return true;

  auto Base = Addr.getOperand(0);
  auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
  if (isNoUnsignedWrap(Base) &&
      (isNoUnsignedWrap(Addr) ||
       (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
    return true;

  auto LHS = Base.getOperand(0);
  auto RHS = Base.getOperand(1);
  return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffset2Legal. We need to emit the selected
        // node here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 =
              CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
          Offset1 =
              CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i32);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i32);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i32);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i32);
  return true;
}
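
// Example (illustrative): for Size = 4 and an address (add %v, 8), Offset0 = 2
// and Offset1 = 3 (byte offsets 8 and 12 divided by the element size), which
// feed the offset0/offset1 fields of a ds_read2_b32 / ds_write2_b32 pair.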

bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // The subtarget prefers to use flat instructions.
  // FIXME: This should be a pattern predicate and not reach here.
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = Subtarget->hasRestrictedSOffset()
                ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                : CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // The addr64 bit was removed for Volcanic Islands.
  // FIXME: This should be a pattern predicate and not reach here.
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until frame
  // elimination, when eliminateFrameIndex chooses the appropriate frame
  // register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    uint64_t C1 = Addr.getConstantOperandVal(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    const SIInstrInfo *TII = Subtarget->getInstrInfo();
    if (TII->isLegalMUBUFImmOffset(C1) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1, DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  const auto *RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    maskTrailingOnes<uint64_t>(32); // Size
    SDLoc DL(Addr);

    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering *>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
                                          SDValue &SOffset) const {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(ByteOffsetNode)) {
    SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
    return true;
  }

  SOffset = ByteOffsetNode;
  return true;
}

// Find a load or store from the corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode *findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        (FlatVariant != SIInstrFlags::FlatScratch ||
         isFlatScratchBaseLegal(Addr))) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field
        // and add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base
          // address is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getSignedTargetConstant(OffsetVal, SDLoc(), MVT::i32);
  return true;
}
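
// Worked example (illustrative, assuming an offset field too small for the
// full constant): for COffsetVal = 0x1234, splitFlatOffset might return
// (OffsetVal = 0x234, RemainderOffset = 0x1000); the remainder is added to
// vaddr with V_ADD_* and only 0x234 lands in the instruction offset field,
// keeping both pieces non-negative as the comment above requires.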

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

1750// If this matches zero_extend i32:x, return x
1752 if (Op.getOpcode() != ISD::ZERO_EXTEND)
1753 return SDValue();
1754
1755 SDValue ExtSrc = Op.getOperand(0);
1756 return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1757}
1758
1759// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1760bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1761 SDValue Addr,
1762 SDValue &SAddr,
1763 SDValue &VOffset,
1764 SDValue &Offset) const {
1765 int64_t ImmOffset = 0;
1766
1767 // Match the immediate offset first, which canonically is moved as low as
1768 // possible.
1769
1770 SDValue LHS, RHS;
1771 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1772 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1773 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1774
1775 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
 1776 SIInstrFlags::FlatGlobal)) {
 1777 Addr = LHS;
1778 ImmOffset = COffsetVal;
1779 } else if (!LHS->isDivergent()) {
1780 if (COffsetVal > 0) {
1781 SDLoc SL(N);
1782 // saddr + large_offset -> saddr +
1783 // (voffset = large_offset & ~MaxOffset) +
1784 // (large_offset & MaxOffset);
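// E.g. (illustrative, assuming MaxOffset = 0xfff): a constant offset of
// 0x12345 becomes a materialized voffset of 0x12000 plus an immediate
// offset of 0x345.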
1785 int64_t SplitImmOffset, RemainderOffset;
1786 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
 1787 COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
 1788
1789 if (isUInt<32>(RemainderOffset)) {
1790 SDNode *VMov = CurDAG->getMachineNode(
1791 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1792 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1793 VOffset = SDValue(VMov, 0);
1794 SAddr = LHS;
1795 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1796 return true;
1797 }
1798 }
1799
1800 // We are adding a 64 bit SGPR and a constant. If constant bus limit
 1801 // is 1, we would need to perform 1 or 2 extra moves for each half of
 1802 // the constant, and it is better to do a scalar add and then issue a
 1803 // single VALU instruction to materialize zero. Otherwise it takes fewer
 1804 // instructions to perform VALU adds with immediates or inline literals.
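 // E.g. (illustrative): if both halves of the constant are inline
 // immediates, NumLiterals is 0, any bus limit (>= 1) exceeds it, and we
 // bail out here so the VALU adds can consume the immediates directly.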
1805 unsigned NumLiterals =
1806 !TII->isInlineConstant(APInt(32, Lo_32(COffsetVal))) +
1807 !TII->isInlineConstant(APInt(32, Hi_32(COffsetVal)));
1808 if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1809 return false;
1810 }
1811 }
1812
1813 // Match the variable offset.
1814 if (Addr.getOpcode() == ISD::ADD) {
1815 LHS = Addr.getOperand(0);
1816 RHS = Addr.getOperand(1);
1817
1818 if (!LHS->isDivergent()) {
1819 // add (i64 sgpr), (zero_extend (i32 vgpr))
1820 if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1821 SAddr = LHS;
1822 VOffset = ZextRHS;
1823 }
1824 }
1825
1826 if (!SAddr && !RHS->isDivergent()) {
1827 // add (zero_extend (i32 vgpr)), (i64 sgpr)
1828 if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1829 SAddr = RHS;
1830 VOffset = ZextLHS;
1831 }
1832 }
1833
1834 if (SAddr) {
1835 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1836 return true;
1837 }
1838 }
1839
1840 if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1841 isa<ConstantSDNode>(Addr))
1842 return false;
1843
1844 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1845 // moves required to copy a 64-bit SGPR to VGPR.
1846 SAddr = Addr;
1847 SDNode *VMov =
1848 CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1849 CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1850 VOffset = SDValue(VMov, 0);
1851 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
1852 return true;
1853}
1854
 1855static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
 1856 if (auto *FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1857 SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1858 } else if (SAddr.getOpcode() == ISD::ADD &&
1859 isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1860 // Materialize this into a scalar move for scalar address to avoid
1861 // readfirstlane.
1862 auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1863 SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1864 FI->getValueType(0));
1865 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1866 MVT::i32, TFI, SAddr.getOperand(1)),
1867 0);
1868 }
1869
1870 return SAddr;
1871}
1872
1873// Match (32-bit SGPR base) + sext(imm offset)
1874bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1875 SDValue &SAddr,
1876 SDValue &Offset) const {
1877 if (Addr->isDivergent())
1878 return false;
1879
1880 SDLoc DL(Addr);
1881
1882 int64_t COffsetVal = 0;
1883
1884 if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
1885 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1886 SAddr = Addr.getOperand(0);
1887 } else {
1888 SAddr = Addr;
1889 }
1890
1891 SAddr = SelectSAddrFI(CurDAG, SAddr);
1892
1893 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1894
1895 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
 1896 SIInstrFlags::FlatScratch)) {
 1897 int64_t SplitImmOffset, RemainderOffset;
1898 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
 1899 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
 1900
1901 COffsetVal = SplitImmOffset;
1902
1903 SDValue AddOffset =
 1904 SAddr.getOpcode() == ISD::TargetFrameIndex
 1905 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1906 : CurDAG->getSignedTargetConstant(RemainderOffset, DL, MVT::i32);
1907 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1908 SAddr, AddOffset),
1909 0);
1910 }
1911
1912 Offset = CurDAG->getSignedTargetConstant(COffsetVal, DL, MVT::i32);
1913
1914 return true;
1915}
1916
1917// Check whether the flat scratch SVS swizzle bug affects this access.
1918bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1919 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1920 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1921 return false;
1922
1923 // The bug affects the swizzling of SVS accesses if there is any carry out
1924 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1925 // voffset to (soffset + inst_offset).
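 // E.g. (illustrative): if VAddr may be 3 (mod 4) and SAddr + ImmOffset may
 // be 2 (mod 4), then 3 + 2 >= 4 means a carry out of bit 1 is possible and
 // the match is conservatively rejected.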
1926 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1927 KnownBits SKnown =
 1928 KnownBits::add(CurDAG->computeKnownBits(SAddr),
 1929 KnownBits::makeConstant(APInt(32, ImmOffset,
1930 /*isSigned=*/true)));
1931 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
 1932 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
 1933 return (VMax & 3) + (SMax & 3) >= 4;
1934}
1935
1936bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1937 SDValue &VAddr, SDValue &SAddr,
1938 SDValue &Offset) const {
1939 int64_t ImmOffset = 0;
1940
1941 SDValue LHS, RHS;
1942 SDValue OrigAddr = Addr;
1943 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1944 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1945 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1946
1947 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1948 Addr = LHS;
1949 ImmOffset = COffsetVal;
1950 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1951 SDLoc SL(N);
1952 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1953 // (large_offset & MaxOffset);
1954 int64_t SplitImmOffset, RemainderOffset;
1955 std::tie(SplitImmOffset, RemainderOffset)
1956 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1957
1958 if (isUInt<32>(RemainderOffset)) {
1959 SDNode *VMov = CurDAG->getMachineNode(
1960 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1961 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1962 VAddr = SDValue(VMov, 0);
1963 SAddr = LHS;
1964 if (!isFlatScratchBaseLegal(Addr))
1965 return false;
1966 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1967 return false;
1968 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
1969 return true;
1970 }
1971 }
1972 }
1973
1974 if (Addr.getOpcode() != ISD::ADD)
1975 return false;
1976
1977 LHS = Addr.getOperand(0);
1978 RHS = Addr.getOperand(1);
1979
1980 if (!LHS->isDivergent() && RHS->isDivergent()) {
1981 SAddr = LHS;
1982 VAddr = RHS;
1983 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1984 SAddr = RHS;
1985 VAddr = LHS;
1986 } else {
1987 return false;
1988 }
1989
1990 if (OrigAddr != Addr) {
1991 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
1992 return false;
1993 } else {
1994 if (!isFlatScratchBaseLegalSV(OrigAddr))
1995 return false;
1996 }
1997
1998 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1999 return false;
2000 SAddr = SelectSAddrFI(CurDAG, SAddr);
2001 Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
2002 return true;
2003}
2004
2005// For unbuffered smem loads, it is illegal for the Immediate Offset to be
 2006 // negative if the resulting (Offset + (M0 or SOffset or zero)) is negative.
2007// Handle the case where the Immediate Offset + SOffset is negative.
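// E.g. (illustrative): ImmOffset = -16 with an SOffset whose known minimum
// value is 8 gives -16 + 8 < 0, so the pair is rejected below.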
2008bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2009 bool Imm32Only,
2010 bool IsBuffer,
2011 int64_t ImmOffset) const {
2012 if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2013 AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2014 KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2015 if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2016 return false;
2017 }
2018
2019 return true;
2020}
2021
2022// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
2023// not null) offset. If Imm32Only is true, match only 32-bit immediate
2024// offsets available on CI.
2025bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
2026 SDValue *SOffset, SDValue *Offset,
2027 bool Imm32Only, bool IsBuffer,
2028 bool HasSOffset,
2029 int64_t ImmOffset) const {
2030 assert((!SOffset || !Offset) &&
2031 "Cannot match both soffset and offset at the same time!");
2032
2033 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
2034 if (!C) {
2035 if (!SOffset)
2036 return false;
2037
2038 if (ByteOffsetNode.getValueType().isScalarInteger() &&
2039 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
2040 *SOffset = ByteOffsetNode;
2041 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2042 ImmOffset);
2043 }
2044 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
2045 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
2046 *SOffset = ByteOffsetNode.getOperand(0);
2047 return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2048 ImmOffset);
2049 }
2050 }
2051 return false;
2052 }
2053
2054 SDLoc SL(ByteOffsetNode);
2055
2056 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
2057 // offset for S_BUFFER instructions is unsigned.
2058 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2059 std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2060 *Subtarget, ByteOffset, IsBuffer, HasSOffset);
2061 if (EncodedOffset && Offset && !Imm32Only) {
2062 *Offset = CurDAG->getSignedTargetConstant(*EncodedOffset, SL, MVT::i32);
2063 return true;
2064 }
2065
2066 // SGPR and literal offsets are unsigned.
2067 if (ByteOffset < 0)
2068 return false;
2069
2070 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
2071 if (EncodedOffset && Offset && Imm32Only) {
2072 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
2073 return true;
2074 }
2075
2076 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
2077 return false;
2078
2079 if (SOffset) {
2080 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
2081 *SOffset = SDValue(
2082 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
2083 return true;
2084 }
2085
2086 return false;
2087}
2088
2089SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2090 if (Addr.getValueType() != MVT::i32)
2091 return Addr;
2092
2093 // Zero-extend a 32-bit address.
2094 SDLoc SL(Addr);
2095
 2096 const MachineFunction &MF = CurDAG->getMachineFunction();
 2097 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 2098 unsigned AddrHiVal = Info->get32BitAddressHighBits();
2099 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2100
2101 const SDValue Ops[] = {
2102 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2103 Addr,
2104 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2105 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2106 0),
2107 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2108 };
2109
2110 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2111 Ops), 0);
2112}
2113
2114// Match a base and an immediate (if Offset is not null) or an SGPR (if
2115// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
2116// true, match only 32-bit immediate offsets available on CI.
2117bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
2118 SDValue *SOffset, SDValue *Offset,
2119 bool Imm32Only, bool IsBuffer,
2120 bool HasSOffset,
2121 int64_t ImmOffset) const {
2122 if (SOffset && Offset) {
2123 assert(!Imm32Only && !IsBuffer);
2124 SDValue B;
2125
2126 if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2127 return false;
2128
2129 int64_t ImmOff = 0;
2130 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2131 ImmOff = C->getSExtValue();
2132
2133 return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2134 ImmOff);
2135 }
2136
2137 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2138 // wraparound, because s_load instructions perform the addition in 64 bits.
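 // E.g. (illustrative): base 0xfffffff0 + offset 0x20 wraps to 0x10 in
 // 32 bits but not in the 64-bit hardware add, so we only match when the
 // ISD::ADD is known not to wrap (nuw).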
2139 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2140 !Addr->getFlags().hasNoUnsignedWrap())
2141 return false;
2142
2143 SDValue N0, N1;
2144 // Extract the base and offset if possible.
2145 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2146 N0 = Addr.getOperand(0);
2147 N1 = Addr.getOperand(1);
2148 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2149 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2150 }
2151 if (!N0 || !N1)
2152 return false;
2153
2154 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2155 ImmOffset)) {
2156 SBase = N0;
2157 return true;
2158 }
2159 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2160 ImmOffset)) {
2161 SBase = N1;
2162 return true;
2163 }
2164 return false;
2165}
2166
2167bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2168 SDValue *SOffset, SDValue *Offset,
2169 bool Imm32Only) const {
2170 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2171 SBase = Expand32BitAddress(SBase);
2172 return true;
2173 }
2174
2175 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2176 SBase = Expand32BitAddress(Addr);
2177 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2178 return true;
2179 }
2180
2181 return false;
2182}
2183
2184bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2185 SDValue &Offset) const {
2186 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2187}
2188
2189bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2190 SDValue &Offset) const {
 2191 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
 2192 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2193 /* Imm32Only */ true);
2194}
2195
2196bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2197 SDValue &SOffset) const {
2198 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2199}
2200
2201bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2202 SDValue &SOffset,
2203 SDValue &Offset) const {
2204 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2205}
2206
2207bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2208 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2209 /* Imm32Only */ false, /* IsBuffer */ true);
2210}
2211
2212bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2213 SDValue &Offset) const {
 2214 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
 2215 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2216 /* Imm32Only */ true, /* IsBuffer */ true);
2217}
2218
2219bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2220 SDValue &Offset) const {
2221 // Match the (soffset + offset) pair as a 32-bit register base and
2222 // an immediate offset.
2223 return N.getValueType() == MVT::i32 &&
2224 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2225 &Offset, /* Imm32Only */ false,
2226 /* IsBuffer */ true);
2227}
2228
2229bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2230 SDValue &Base,
2231 SDValue &Offset) const {
2232 SDLoc DL(Index);
2233
2234 if (CurDAG->isBaseWithConstantOffset(Index)) {
2235 SDValue N0 = Index.getOperand(0);
2236 SDValue N1 = Index.getOperand(1);
2237 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2238
2239 // (add n0, c0)
2240 // Don't peel off the offset (c0) if doing so could possibly lead
2241 // the base (n0) to be negative.
2242 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2243 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2244 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2245 Base = N0;
2246 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2247 return true;
2248 }
2249 }
2250
2251 if (isa<ConstantSDNode>(Index))
2252 return false;
2253
2254 Base = Index;
2255 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2256 return true;
2257}
2258
2259SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2260 SDValue Val, uint32_t Offset,
2261 uint32_t Width) {
2262 if (Val->isDivergent()) {
2263 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
 2264 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
 2265 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2266
2267 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2268 }
2269 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2270 // Transformation function, pack the offset and width of a BFE into
2271 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2272 // source, bits [5:0] contain the offset and bits [22:16] the width.
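 // E.g. (illustrative): Offset = 16, Width = 8 packs to 0x00080010.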
2273 uint32_t PackedVal = Offset | (Width << 16);
2274 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2275
2276 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2277}
2278
2279void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2280 // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2281 // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2282 // Predicate: 0 < b <= c < 32
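 // E.g. (illustrative): "(x << 8) srl 24" becomes "BFE_U32 x, 16, 8", the
 // 8-bit field starting at bit 16 of x.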
2283
2284 const SDValue &Shl = N->getOperand(0);
2285 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2286 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2287
2288 if (B && C) {
2289 uint32_t BVal = B->getZExtValue();
2290 uint32_t CVal = C->getZExtValue();
2291
2292 if (0 < BVal && BVal <= CVal && CVal < 32) {
2293 bool Signed = N->getOpcode() == ISD::SRA;
2294 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2295 32 - CVal));
2296 return;
2297 }
2298 }
2299 SelectCode(N);
2300}
2301
2302void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2303 switch (N->getOpcode()) {
2304 case ISD::AND:
2305 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2306 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2307 // Predicate: isMask(mask)
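 // E.g. (illustrative): "(x srl 4) & 0xff" becomes "BFE_U32 x, 4, 8".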
2308 const SDValue &Srl = N->getOperand(0);
2309 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2310 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2311
2312 if (Shift && Mask) {
2313 uint32_t ShiftVal = Shift->getZExtValue();
2314 uint32_t MaskVal = Mask->getZExtValue();
2315
2316 if (isMask_32(MaskVal)) {
2317 uint32_t WidthVal = llvm::popcount(MaskVal);
2318 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2319 WidthVal));
2320 return;
2321 }
2322 }
2323 }
2324 break;
2325 case ISD::SRL:
2326 if (N->getOperand(0).getOpcode() == ISD::AND) {
2327 // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2328 // Predicate: isMask(mask >> b)
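 // E.g. (illustrative): "(x & 0xff0) srl 4" becomes "BFE_U32 x, 4, 8",
 // since 0xff0 >> 4 == 0xff is a mask of width 8.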
2329 const SDValue &And = N->getOperand(0);
2330 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2331 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2332
2333 if (Shift && Mask) {
2334 uint32_t ShiftVal = Shift->getZExtValue();
2335 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2336
2337 if (isMask_32(MaskVal)) {
2338 uint32_t WidthVal = llvm::popcount(MaskVal);
2339 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2340 WidthVal));
2341 return;
2342 }
2343 }
2344 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2345 SelectS_BFEFromShifts(N);
2346 return;
2347 }
2348 break;
2349 case ISD::SRA:
2350 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2351 SelectS_BFEFromShifts(N);
2352 return;
2353 }
2354 break;
2355
 2356 case ISD::SIGN_EXTEND_INREG: {
 2357 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2358 SDValue Src = N->getOperand(0);
2359 if (Src.getOpcode() != ISD::SRL)
2360 break;
2361
2362 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2363 if (!Amt)
2364 break;
2365
2366 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2367 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2368 Amt->getZExtValue(), Width));
2369 return;
2370 }
2371 }
2372
2373 SelectCode(N);
2374}
2375
2376bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2377 assert(N->getOpcode() == ISD::BRCOND);
2378 if (!N->hasOneUse())
2379 return false;
2380
2381 SDValue Cond = N->getOperand(1);
2382 if (Cond.getOpcode() == ISD::CopyToReg)
2383 Cond = Cond.getOperand(2);
2384
2385 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2386 return false;
2387
2388 MVT VT = Cond.getOperand(0).getSimpleValueType();
2389 if (VT == MVT::i32)
2390 return true;
2391
2392 if (VT == MVT::i64) {
2393 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2394 return (CC == ISD::SETEQ || CC == ISD::SETNE) &&
2395 Subtarget->hasScalarCompareEq64();
2396 }
2397
2398 if ((VT == MVT::f16 || VT == MVT::f32) && Subtarget->hasSALUFloatInsts())
2399 return true;
2400
2401 return false;
2402}
2403
2404static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
2405 assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
2406 // Special case for amdgcn.ballot:
2407 // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
2408 // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
2409 // =>
2410 // Use i1 %Cond value instead of i(WaveSize) %VCMP.
2411 // This is possible because divergent ISD::SETCC is selected as V_CMP and
2412 // Cond becomes a i(WaveSize) full mask value.
 2413 // Note that ballot doesn't use the SETEQ condition, but it's easy to support
2414 // here for completeness, so in this case Negate is set true on return.
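 // E.g. (illustrative wave64 case):
 //   %c = i1 ISD::SETCC ...
 //   %b = i64 AMDGPUISD::SETCC (zext %c), 0, setne
 // folds to use %c directly, with Negate = false on return.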
2415 auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
2416 if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
2417 isNullConstant(VCMP.getOperand(1))) {
2418
2419 auto Cond = VCMP.getOperand(0);
2420 if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
2421 Cond = Cond.getOperand(0);
2422
2423 if (isBoolSGPR(Cond)) {
2424 Negate = VCMP_CC == ISD::SETEQ;
2425 return Cond;
2426 }
2427 }
2428 return SDValue();
2429}
2430
2431void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2432 SDValue Cond = N->getOperand(1);
2433
2434 if (Cond.isUndef()) {
2435 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2436 N->getOperand(2), N->getOperand(0));
2437 return;
2438 }
2439
2440 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2441
2442 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2443 bool AndExec = !UseSCCBr;
2444 bool Negate = false;
2445
2446 if (Cond.getOpcode() == ISD::SETCC &&
2447 Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
2448 SDValue VCMP = Cond->getOperand(0);
2449 auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
2450 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
2451 isNullConstant(Cond->getOperand(1)) &&
2452 // We may encounter ballot.i64 in wave32 mode on -O0.
2453 VCMP.getValueType().getSizeInBits() == Subtarget->getWavefrontSize()) {
2454 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2455 // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
2456 // BRCOND i1 %C, %BB
2457 // =>
2458 // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
2459 // VCC = COPY i(WaveSize) %VCMP
2460 // S_CBRANCH_VCCNZ/VCCZ %BB
2461 Negate = CC == ISD::SETEQ;
2462 bool NegatedBallot = false;
2463 if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
2464 Cond = BallotCond;
2465 UseSCCBr = !BallotCond->isDivergent();
2466 Negate = Negate ^ NegatedBallot;
2467 } else {
2468 // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
2469 // selected as V_CMP, but this may change for uniform condition.
2470 Cond = VCMP;
2471 UseSCCBr = false;
2472 }
2473 }
 2474 // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination
 2475 // of V_CMPs resulting from a ballot, or the ballot has a uniform
 2476 // condition and SCC is used.
2477 AndExec = false;
2478 }
2479
2480 unsigned BrOp =
2481 UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
2482 : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
2483 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2484 SDLoc SL(N);
2485
2486 if (AndExec) {
2487 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2488 // analyzed what generates the vcc value, so we do not know whether vcc
2489 // bits for disabled lanes are 0. Thus we need to mask out bits for
2490 // disabled lanes.
2491 //
2492 // For the case that we select S_CBRANCH_SCC1 and it gets
2493 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
 2494 // SIInstrInfo::moveToVALU, which inserts the S_AND.
2495 //
2496 // We could add an analysis of what generates the vcc value here and omit
 2497 // the S_AND when it is unnecessary. But it would be better to add a separate
2498 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2499 // catches both cases.
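 // Schematically (wave64 shown; illustrative):
 //   s_and_b64 vcc, exec, <cond>
 //   s_cbranch_vccnz <bb>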
2500 Cond = SDValue(
 2501 CurDAG->getMachineNode(
 2502 Subtarget->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64, SL,
2503 MVT::i1,
2504 CurDAG->getRegister(Subtarget->isWave32() ? AMDGPU::EXEC_LO
2505 : AMDGPU::EXEC,
2506 MVT::i1),
2507 Cond),
2508 0);
2509 }
2510
2511 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2512 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2513 N->getOperand(2), // Basic Block
2514 VCC.getValue(0));
2515}
2516
2517void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
2518 if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
2519 !N->isDivergent()) {
2520 SDValue Src = N->getOperand(0);
2521 if (Src.getValueType() == MVT::f16) {
2522 if (isExtractHiElt(Src, Src)) {
2523 CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
2524 {Src});
2525 return;
2526 }
2527 }
2528 }
2529
2530 SelectCode(N);
2531}
2532
2533void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2534 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2535 // be copied to an SGPR with readfirstlane.
2536 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2537 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2538
2539 SDValue Chain = N->getOperand(0);
2540 SDValue Ptr = N->getOperand(2);
2541 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2542 MachineMemOperand *MMO = M->getMemOperand();
2543 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2544
 2545 SDValue Offset;
 2546 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
 2547 SDValue PtrBase = Ptr.getOperand(0);
2548 SDValue PtrOffset = Ptr.getOperand(1);
2549
2550 const APInt &OffsetVal = PtrOffset->getAsAPIntVal();
2551 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2552 N = glueCopyToM0(N, PtrBase);
2553 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2554 }
2555 }
2556
2557 if (!Offset) {
2558 N = glueCopyToM0(N, Ptr);
2559 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2560 }
2561
2562 SDValue Ops[] = {
2563 Offset,
2564 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2565 Chain,
2566 N->getOperand(N->getNumOperands() - 1) // New glue
2567 };
2568
2569 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2570 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2571}
2572
2573// We need to handle this here because tablegen doesn't support matching
2574// instructions with multiple outputs.
2575void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2576 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2577 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2578 N->getOperand(5), N->getOperand(0)};
2579
2580 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2581 MachineMemOperand *MMO = M->getMemOperand();
2582 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2583 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2584}
2585
2586static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2587 switch (IntrID) {
2588 case Intrinsic::amdgcn_ds_gws_init:
2589 return AMDGPU::DS_GWS_INIT;
2590 case Intrinsic::amdgcn_ds_gws_barrier:
2591 return AMDGPU::DS_GWS_BARRIER;
2592 case Intrinsic::amdgcn_ds_gws_sema_v:
2593 return AMDGPU::DS_GWS_SEMA_V;
2594 case Intrinsic::amdgcn_ds_gws_sema_br:
2595 return AMDGPU::DS_GWS_SEMA_BR;
2596 case Intrinsic::amdgcn_ds_gws_sema_p:
2597 return AMDGPU::DS_GWS_SEMA_P;
2598 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2599 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2600 default:
2601 llvm_unreachable("not a gws intrinsic");
2602 }
2603}
2604
2605void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2606 if (!Subtarget->hasGWS() ||
2607 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2608 !Subtarget->hasGWSSemaReleaseAll())) {
2609 // Let this error.
2610 SelectCode(N);
2611 return;
2612 }
2613
2614 // Chain, intrinsic ID, vsrc, offset
2615 const bool HasVSrc = N->getNumOperands() == 4;
2616 assert(HasVSrc || N->getNumOperands() == 3);
2617
2618 SDLoc SL(N);
2619 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2620 int ImmOffset = 0;
2621 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2622 MachineMemOperand *MMO = M->getMemOperand();
2623
2624 // Don't worry if the offset ends up in a VGPR. Only one lane will have
 2625 // an effect, so SIFixSGPRCopies will validly insert readfirstlane.
2626
2627 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2628 // offset field) % 64. Some versions of the programming guide omit the m0
2629 // part, or claim it's from offset 0.
2630 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2631 // If we have a constant offset, try to use the 0 in m0 as the base.
2632 // TODO: Look into changing the default m0 initialization value. If the
2633 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2634 // the immediate offset.
2635 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2636 ImmOffset = ConstOffset->getZExtValue();
2637 } else {
2638 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2639 ImmOffset = BaseOffset.getConstantOperandVal(1);
2640 BaseOffset = BaseOffset.getOperand(0);
2641 }
2642
2643 // Prefer to do the shift in an SGPR since it should be possible to use m0
2644 // as the result directly. If it's already an SGPR, it will be eliminated
2645 // later.
2646 SDNode *SGPROffset
2647 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2648 BaseOffset);
2649 // Shift to offset in m0
2650 SDNode *M0Base
2651 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2652 SDValue(SGPROffset, 0),
2653 CurDAG->getTargetConstant(16, SL, MVT::i32));
2654 glueCopyToM0(N, SDValue(M0Base, 0));
2655 }
2656
2657 SDValue Chain = N->getOperand(0);
2658 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2659
2660 const unsigned Opc = gwsIntrinToOpcode(IntrID);
 2661 SmallVector<SDValue, 5> Ops;
 2662 if (HasVSrc)
2663 Ops.push_back(N->getOperand(2));
2664 Ops.push_back(OffsetField);
2665 Ops.push_back(Chain);
2666
2667 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2668 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2669}
2670
2671void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2672 if (Subtarget->getLDSBankCount() != 16) {
2673 // This is a single instruction with a pattern.
2674 SelectCode(N);
2675 return;
2676 }
2677
2678 SDLoc DL(N);
2679
2680 // This requires 2 instructions. It is possible to write a pattern to support
2681 // this, but the generated isel emitter doesn't correctly deal with multiple
2682 // output instructions using the same physical register input. The copy to m0
2683 // is incorrectly placed before the second instruction.
2684 //
2685 // TODO: Match source modifiers.
2686 //
2687 // def : Pat <
2688 // (int_amdgcn_interp_p1_f16
2689 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2690 // (i32 timm:$attrchan), (i32 timm:$attr),
2691 // (i1 timm:$high), M0),
2692 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2693 // timm:$attrchan, 0,
2694 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2695 // let Predicates = [has16BankLDS];
2696 // }
2697
2698 // 16 bank LDS
2699 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2700 N->getOperand(5), SDValue());
2701
2702 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2703
2704 SDNode *InterpMov =
2705 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2706 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2707 N->getOperand(3), // Attr
2708 N->getOperand(2), // Attrchan
2709 ToM0.getValue(1) // In glue
2710 });
2711
2712 SDNode *InterpP1LV =
2713 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2714 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2715 N->getOperand(1), // Src0
2716 N->getOperand(3), // Attr
2717 N->getOperand(2), // Attrchan
2718 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2719 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2720 N->getOperand(4), // high
2721 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2722 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2723 SDValue(InterpMov, 1)
2724 });
2725
2726 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2727}
2728
2729void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2730 unsigned IntrID = N->getConstantOperandVal(1);
2731 switch (IntrID) {
2732 case Intrinsic::amdgcn_ds_append:
2733 case Intrinsic::amdgcn_ds_consume: {
2734 if (N->getValueType(0) != MVT::i32)
2735 break;
2736 SelectDSAppendConsume(N, IntrID);
2737 return;
2738 }
2739 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2740 SelectDSBvhStackIntrinsic(N);
2741 return;
2742 case Intrinsic::amdgcn_init_whole_wave:
 2743 CurDAG->getMachineFunction()
 2744 .getInfo<SIMachineFunctionInfo>()
 2745 ->setInitWholeWave();
2746 break;
2747 }
2748
2749 SelectCode(N);
2750}
2751
2752void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2753 unsigned IntrID = N->getConstantOperandVal(0);
2754 unsigned Opcode = AMDGPU::INSTRUCTION_LIST_END;
2755 SDNode *ConvGlueNode = N->getGluedNode();
2756 if (ConvGlueNode) {
2757 // FIXME: Possibly iterate over multiple glue nodes?
2758 assert(ConvGlueNode->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
2759 ConvGlueNode = ConvGlueNode->getOperand(0).getNode();
2760 ConvGlueNode =
2761 CurDAG->getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, {},
2762 MVT::Glue, SDValue(ConvGlueNode, 0));
2763 } else {
2764 ConvGlueNode = nullptr;
2765 }
2766 switch (IntrID) {
2767 case Intrinsic::amdgcn_wqm:
2768 Opcode = AMDGPU::WQM;
2769 break;
2770 case Intrinsic::amdgcn_softwqm:
2771 Opcode = AMDGPU::SOFT_WQM;
2772 break;
2773 case Intrinsic::amdgcn_wwm:
2774 case Intrinsic::amdgcn_strict_wwm:
2775 Opcode = AMDGPU::STRICT_WWM;
2776 break;
2777 case Intrinsic::amdgcn_strict_wqm:
2778 Opcode = AMDGPU::STRICT_WQM;
2779 break;
2780 case Intrinsic::amdgcn_interp_p1_f16:
2781 SelectInterpP1F16(N);
2782 return;
2783 case Intrinsic::amdgcn_permlane16_swap:
2784 case Intrinsic::amdgcn_permlane32_swap: {
2785 if ((IntrID == Intrinsic::amdgcn_permlane16_swap &&
2786 !Subtarget->hasPermlane16Swap()) ||
2787 (IntrID == Intrinsic::amdgcn_permlane32_swap &&
2788 !Subtarget->hasPermlane32Swap())) {
2789 SelectCode(N); // Hit the default error
2790 return;
2791 }
2792
2793 Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
2794 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
2795 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
2796
2797 SmallVector<SDValue, 4> NewOps(N->op_begin() + 1, N->op_end());
2798 if (ConvGlueNode)
2799 NewOps.push_back(SDValue(ConvGlueNode, 0));
2800
2801 bool FI = N->getConstantOperandVal(3);
2802 NewOps[2] = CurDAG->getTargetConstant(
 2803 FI ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0, SDLoc(N), MVT::i32);
 2804
2805 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), NewOps);
2806 return;
2807 }
2808 default:
2809 SelectCode(N);
2810 break;
2811 }
2812
2813 if (Opcode != AMDGPU::INSTRUCTION_LIST_END) {
2814 SDValue Src = N->getOperand(1);
2815 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2816 }
2817
2818 if (ConvGlueNode) {
2819 SmallVector<SDValue, 4> NewOps(N->ops());
2820 NewOps.push_back(SDValue(ConvGlueNode, 0));
2821 CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), NewOps);
2822 }
2823}
2824
2825void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2826 unsigned IntrID = N->getConstantOperandVal(1);
2827 switch (IntrID) {
2828 case Intrinsic::amdgcn_ds_gws_init:
2829 case Intrinsic::amdgcn_ds_gws_barrier:
2830 case Intrinsic::amdgcn_ds_gws_sema_v:
2831 case Intrinsic::amdgcn_ds_gws_sema_br:
2832 case Intrinsic::amdgcn_ds_gws_sema_p:
2833 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2834 SelectDS_GWS(N, IntrID);
2835 return;
2836 default:
2837 break;
2838 }
2839
2840 SelectCode(N);
2841}
2842
2843void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2844 SDValue Log2WaveSize =
2845 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2846 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2847 {N->getOperand(0), Log2WaveSize});
2848}
2849
2850void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2851 SDValue SrcVal = N->getOperand(1);
2852 if (SrcVal.getValueType() != MVT::i32) {
2853 SelectCode(N); // Emit default error
2854 return;
2855 }
2856
2857 SDValue CopyVal;
 2858 Register SP = TLI->getStackPointerRegisterToSaveRestore();
 2859 SDLoc SL(N);
2860
2861 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2862 CopyVal = SrcVal.getOperand(0);
2863 } else {
2864 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2865 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2866
2867 if (N->isDivergent()) {
2868 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2869 MVT::i32, SrcVal),
2870 0);
2871 }
2872
2873 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2874 {SrcVal, Log2WaveSize}),
2875 0);
2876 }
2877
2878 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2879 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2880}
2881
2882bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2883 unsigned &Mods,
2884 bool IsCanonicalizing,
2885 bool AllowAbs) const {
2886 Mods = SISrcMods::NONE;
2887 Src = In;
2888
2889 if (Src.getOpcode() == ISD::FNEG) {
2890 Mods |= SISrcMods::NEG;
2891 Src = Src.getOperand(0);
2892 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2893 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2894 // denormal mode, but we're implicitly canonicalizing in a source operand.
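 // E.g. (illustrative): (fsub 0.0, x) is treated here as neg(x).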
2895 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2896 if (LHS && LHS->isZero()) {
2897 Mods |= SISrcMods::NEG;
2898 Src = Src.getOperand(1);
2899 }
2900 }
2901
2902 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2903 Mods |= SISrcMods::ABS;
2904 Src = Src.getOperand(0);
2905 }
2906
2907 return true;
2908}
2909
2910bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2911 SDValue &SrcMods) const {
2912 unsigned Mods;
2913 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2914 /*AllowAbs=*/true)) {
2915 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2916 return true;
2917 }
2918
2919 return false;
2920}
2921
2922bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2923 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2924 unsigned Mods;
2925 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2926 /*AllowAbs=*/true)) {
2927 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2928 return true;
2929 }
2930
2931 return false;
2932}
2933
2934bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2935 SDValue &SrcMods) const {
2936 unsigned Mods;
2937 if (SelectVOP3ModsImpl(In, Src, Mods,
2938 /*IsCanonicalizing=*/true,
2939 /*AllowAbs=*/false)) {
2940 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2941 return true;
2942 }
2943
2944 return false;
2945}
2946
2947bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2948 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2949 return false;
2950
2951 Src = In;
2952 return true;
2953}
2954
2955bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2956 SDValue &SrcMods,
2957 bool OpSel) const {
2958 unsigned Mods;
2959 if (SelectVOP3ModsImpl(In, Src, Mods,
2960 /*IsCanonicalizing=*/true,
2961 /*AllowAbs=*/false)) {
2962 if (OpSel)
2963 Mods |= SISrcMods::OP_SEL_0;
2964 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2965 return true;
2966 }
2967
2968 return false;
2969}
2970
2971bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2972 SDValue &SrcMods) const {
2973 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2974}
2975
2976bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2977 SDValue &SrcMods) const {
2978 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2979}
2980
2981bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2982 SDValue &SrcMods, SDValue &Clamp,
2983 SDValue &Omod) const {
2984 SDLoc DL(In);
2985 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2986 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2987
2988 return SelectVOP3Mods(In, Src, SrcMods);
2989}
2990
2991bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2992 SDValue &SrcMods, SDValue &Clamp,
2993 SDValue &Omod) const {
2994 SDLoc DL(In);
2995 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2996 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2997
2998 return SelectVOP3BMods(In, Src, SrcMods);
2999}
3000
3001bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
3002 SDValue &Clamp, SDValue &Omod) const {
3003 Src = In;
3004
3005 SDLoc DL(In);
3006 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
3007 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
3008
3009 return true;
3010}
3011
3012bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
3013 SDValue &SrcMods, bool IsDOT) const {
3014 unsigned Mods = SISrcMods::NONE;
3015 Src = In;
3016
3017 // TODO: Handle G_FSUB 0 as fneg
3018 if (Src.getOpcode() == ISD::FNEG) {
 3019 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
 3020 Src = Src.getOperand(0);
3021 }
3022
3023 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
3024 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
3025 unsigned VecMods = Mods;
3026
3027 SDValue Lo = stripBitcast(Src.getOperand(0));
3028 SDValue Hi = stripBitcast(Src.getOperand(1));
3029
3030 if (Lo.getOpcode() == ISD::FNEG) {
3031 Lo = stripBitcast(Lo.getOperand(0));
3032 Mods ^= SISrcMods::NEG;
3033 }
3034
3035 if (Hi.getOpcode() == ISD::FNEG) {
3036 Hi = stripBitcast(Hi.getOperand(0));
3037 Mods ^= SISrcMods::NEG_HI;
3038 }
3039
3040 if (isExtractHiElt(Lo, Lo))
3041 Mods |= SISrcMods::OP_SEL_0;
3042
3043 if (isExtractHiElt(Hi, Hi))
3044 Mods |= SISrcMods::OP_SEL_1;
3045
3046 unsigned VecSize = Src.getValueSizeInBits();
3047 Lo = stripExtractLoElt(Lo);
3048 Hi = stripExtractLoElt(Hi);
3049
3050 if (Lo.getValueSizeInBits() > VecSize) {
 3051 Lo = CurDAG->getTargetExtractSubreg(
 3052 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3053 MVT::getIntegerVT(VecSize), Lo);
3054 }
3055
3056 if (Hi.getValueSizeInBits() > VecSize) {
 3057 Hi = CurDAG->getTargetExtractSubreg(
 3058 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
3059 MVT::getIntegerVT(VecSize), Hi);
3060 }
3061
3062 assert(Lo.getValueSizeInBits() <= VecSize &&
3063 Hi.getValueSizeInBits() <= VecSize);
3064
3065 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
3066 // Really a scalar input. Just select from the low half of the register to
3067 // avoid packing.
3068
3069 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
3070 Src = Lo;
3071 } else {
3072 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
3073
3074 SDLoc SL(In);
 3075 SDValue Undef = SDValue(
 3076 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
3077 Lo.getValueType()), 0);
3078 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
3079 : AMDGPU::SReg_64RegClassID;
3080 const SDValue Ops[] = {
3081 CurDAG->getTargetConstant(RC, SL, MVT::i32),
3082 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
3083 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
3084
3085 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
3086 Src.getValueType(), Ops), 0);
3087 }
3088 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3089 return true;
3090 }
3091
3092 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
3093 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
3094 .bitcastToAPInt().getZExtValue();
3095 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
3096 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
3097 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3098 return true;
3099 }
3100 }
3101
3102 Mods = VecMods;
3103 }
3104
3105 // Packed instructions do not have abs modifiers.
3106 Mods |= SISrcMods::OP_SEL_1;
3107
3108 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3109 return true;
3110}
3111
3112bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
3113 SDValue &SrcMods) const {
3114 return SelectVOP3PMods(In, Src, SrcMods, true);
3115}
3116
3117bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
3118 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3119 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3120 // 1 promotes packed values to signed, 0 treats them as unsigned.
3121 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3122
3123 unsigned Mods = SISrcMods::OP_SEL_1;
3124 unsigned SrcSign = C->getZExtValue();
3125 if (SrcSign == 1)
3126 Mods ^= SISrcMods::NEG;
3127
3128 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3129 return true;
3130}
3131
3132bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
3133 SDValue &Src) const {
3134 const ConstantSDNode *C = cast<ConstantSDNode>(In);
3135 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
3136
3137 unsigned Mods = SISrcMods::OP_SEL_1;
3138 unsigned SrcVal = C->getZExtValue();
3139 if (SrcVal == 1)
3140 Mods |= SISrcMods::OP_SEL_0;
3141
3142 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3143 return true;
3144}
3145
 3146static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
 3147 llvm::SelectionDAG *CurDAG,
3148 const SDLoc &DL) {
3149 unsigned DstRegClass;
3150 EVT DstTy;
3151 switch (Elts.size()) {
3152 case 8:
3153 DstRegClass = AMDGPU::VReg_256RegClassID;
3154 DstTy = MVT::v8i32;
3155 break;
3156 case 4:
3157 DstRegClass = AMDGPU::VReg_128RegClassID;
3158 DstTy = MVT::v4i32;
3159 break;
3160 case 2:
3161 DstRegClass = AMDGPU::VReg_64RegClassID;
3162 DstTy = MVT::v2i32;
3163 break;
3164 default:
3165 llvm_unreachable("unhandled Reg sequence size");
3166 }
3167
 3168 SmallVector<SDValue, 17> Ops;
 3169 Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
3170 for (unsigned i = 0; i < Elts.size(); ++i) {
3171 Ops.push_back(Elts[i]);
3172 Ops.push_back(CurDAG->getTargetConstant(
 3173 SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
 3174 }
3175 return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
3176}
3177
 3178static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
 3179 llvm::SelectionDAG *CurDAG,
3180 const SDLoc &DL) {
3181 SmallVector<SDValue, 8> PackedElts;
3182 assert("unhandled Reg sequence size" &&
3183 (Elts.size() == 8 || Elts.size() == 16));
3184
3185 // Pack 16-bit elements in pairs into 32-bit register. If both elements are
 3186 // unpacked from the same 32-bit source, use it; otherwise pack them using v_perm.
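 // E.g. (illustrative): with the 0x05040100 selector below, V_PERM_B32 puts
 // the low 16 bits of Elts[i] into the low half and the low 16 bits of
 // Elts[i + 1] into the high half of the packed 32-bit result.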
3187 for (unsigned i = 0; i < Elts.size(); i += 2) {
3188 SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i]));
3189 SDValue HiSrc;
3190 if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) {
3191 PackedElts.push_back(HiSrc);
3192 } else {
3193 SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32);
3194 MachineSDNode *Packed =
3195 CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32,
3196 {Elts[i + 1], Elts[i], PackLoLo});
3197 PackedElts.push_back(SDValue(Packed, 0));
3198 }
3199 }
3200
3201 return buildRegSequence32(PackedElts, CurDAG, DL);
3202}
3203
 3204static MachineSDNode *buildRegSequence(SmallVectorImpl<SDValue> &Elts,
 3205 llvm::SelectionDAG *CurDAG,
3206 const SDLoc &DL, unsigned ElementSize) {
3207 if (ElementSize == 16)
3208 return buildRegSequence16(Elts, CurDAG, DL);
3209 if (ElementSize == 32)
3210 return buildRegSequence32(Elts, CurDAG, DL);
3211 llvm_unreachable("Unhandled element size");
3212}
3213
3214static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
 3215 SmallVectorImpl<SDValue> &Elts, SDValue &Src,
 3216 llvm::SelectionDAG *CurDAG, const SDLoc &DL,
3217 unsigned ElementSize) {
3218 if (ModOpcode == ISD::FNEG) {
3219 Mods |= SISrcMods::NEG;
3220 // Check if all elements also have abs modifier
3221 SmallVector<SDValue, 8> NegAbsElts;
3222 for (auto El : Elts) {
3223 if (El.getOpcode() != ISD::FABS)
3224 break;
3225 NegAbsElts.push_back(El->getOperand(0));
3226 }
3227 if (Elts.size() != NegAbsElts.size()) {
3228 // Neg
3229 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3230 } else {
3231 // Neg and Abs
3232 Mods |= SISrcMods::NEG_HI;
3233 Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0);
3234 }
3235 } else {
3236 assert(ModOpcode == ISD::FABS);
3237 // Abs
3238 Mods |= SISrcMods::NEG_HI;
3239 Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0);
3240 }
3241}
3242
3243// Check all f16 elements for modifiers while looking through b32 and v2b16
 3244 // build vectors; stop if an element does not satisfy ModifierCheck.
3245static void
 3246checkWMMAElementsModifiersF16(BuildVectorSDNode *BV,
 3247 std::function<bool(SDValue)> ModifierCheck) {
3248 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3249 if (auto *F16Pair =
3250 dyn_cast<BuildVectorSDNode>(stripBitcast(BV->getOperand(i)))) {
3251 for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) {
3252 SDValue ElF16 = stripBitcast(F16Pair->getOperand(i));
3253 if (!ModifierCheck(ElF16))
3254 break;
3255 }
3256 }
3257 }
3258}
3259
3260bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src,
3261 SDValue &SrcMods) const {
3262 Src = In;
3263 unsigned Mods = SISrcMods::OP_SEL_1;
3264
3265 // mods are on f16 elements
3266 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
 3267 SmallVector<SDValue, 8> EltsF16;
 3268
3269 checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool {
3270 if (Element.getOpcode() != ISD::FNEG)
3271 return false;
3272 EltsF16.push_back(Element.getOperand(0));
3273 return true;
3274 });
3275
3276 // All elements have neg modifier
3277 if (BV->getNumOperands() * 2 == EltsF16.size()) {
3278 Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0);
3279 Mods |= SISrcMods::NEG;
3280 Mods |= SISrcMods::NEG_HI;
3281 }
3282 }
3283
3284 // mods are on v2f16 elements
3285 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3286 SmallVector<SDValue, 8> EltsV2F16;
3287 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3288 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3289 // Based on first element decide which mod we match, neg or abs
3290 if (ElV2f16.getOpcode() != ISD::FNEG)
3291 break;
3292 EltsV2F16.push_back(ElV2f16.getOperand(0));
3293 }
3294
3295 // All pairs of elements have neg modifier
3296 if (BV->getNumOperands() == EltsV2F16.size()) {
3297 Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0);
3298 Mods |= SISrcMods::NEG;
3299 Mods |= SISrcMods::NEG_HI;
3300 }
3301 }
3302
3303 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3304 return true;
3305}
3306
3307bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src,
3308 SDValue &SrcMods) const {
3309 Src = In;
3310 unsigned Mods = SISrcMods::OP_SEL_1;
3311 unsigned ModOpcode;
3312
3313 // mods are on f16 elements
3314 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
 3314 SmallVector<SDValue, 8> EltsF16;
 3315
 3316 checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool {
3317 // Based on first element decide which mod we match, neg or abs
3318 if (EltsF16.empty())
3319 ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3320 if (ElF16.getOpcode() != ModOpcode)
3321 return false;
3322 EltsF16.push_back(ElF16.getOperand(0));
3323 return true;
3324 });
3325
3326 // All elements have ModOpcode modifier
3327 if (BV->getNumOperands() * 2 == EltsF16.size())
3328 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In),
3329 16);
3330 }
3331
3332 // mods are on v2f16 elements
3333 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3334 SmallVector<SDValue, 8> EltsV2F16;
3335
3336 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3337 SDValue ElV2f16 = stripBitcast(BV->getOperand(i));
3338 // Based on first element decide which mod we match, neg or abs
3339 if (EltsV2F16.empty())
3340 ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3341 if (ElV2f16->getOpcode() != ModOpcode)
3342 break;
3343 EltsV2F16.push_back(ElV2f16->getOperand(0));
3344 }
3345
3346 // All elements have ModOpcode modifier
3347 if (BV->getNumOperands() == EltsV2F16.size())
3348 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In),
3349 32);
3350 }
3351
3352 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3353 return true;
3354}
3355
3356bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src,
3357 SDValue &SrcMods) const {
3358 Src = In;
3359 unsigned Mods = SISrcMods::OP_SEL_1;
 3360 SmallVector<SDValue, 8> EltsF32;
 3361
3362 if (auto *BV = dyn_cast<BuildVectorSDNode>(stripBitcast(In))) {
3363 assert(BV->getNumOperands() > 0);
3364 // Based on first element decide which mod we match, neg or abs
3365 SDValue ElF32 = stripBitcast(BV->getOperand(0));
3366 unsigned ModOpcode =
3367 (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS;
3368 for (unsigned i = 0; i < BV->getNumOperands(); ++i) {
3369 SDValue ElF32 = stripBitcast(BV->getOperand(i));
3370 if (ElF32.getOpcode() != ModOpcode)
3371 break;
3372 EltsF32.push_back(ElF32.getOperand(0));
3373 }
3374
3375 // All elements had ModOpcode modifier
3376 if (BV->getNumOperands() == EltsF32.size())
3377 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In),
3378 32);
3379 }
3380
3381 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3382 return true;
3383}
3384
3385bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const {
3386 if (auto *BV = dyn_cast<BuildVectorSDNode>(In)) {
3387 BitVector UndefElements;
3388 if (SDValue Splat = BV->getSplatValue(&UndefElements))
3389 if (isInlineImmediate(Splat.getNode())) {
3390 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat)) {
3391 unsigned Imm = C->getAPIntValue().getSExtValue();
3392 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3393 return true;
3394 }
3395 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat)) {
3396 unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue();
3397 Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32);
3398 return true;
3399 }
3400 llvm_unreachable("unhandled Constant node");
3401 }
3402 }
3403
3404 // 16 bit splat
3405 SDValue SplatSrc32 = stripBitcast(In);
3406 if (auto *SplatSrc32BV = dyn_cast<BuildVectorSDNode>(SplatSrc32))
3407 if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) {
3408 SDValue SplatSrc16 = stripBitcast(Splat32);
3409 if (auto *SplatSrc16BV = dyn_cast<BuildVectorSDNode>(SplatSrc16))
3410 if (SDValue Splat = SplatSrc16BV->getSplatValue()) {
3411 const SIInstrInfo *TII = Subtarget->getInstrInfo();
3412 std::optional<APInt> RawValue;
3413 if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Splat))
3414 RawValue = C->getValueAPF().bitcastToAPInt();
3415 else if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat))
3416 RawValue = C->getAPIntValue();
3417
3418 if (RawValue.has_value()) {
3419 EVT VT = In.getValueType().getScalarType();
3420 if (VT.getSimpleVT() == MVT::f16 || VT.getSimpleVT() == MVT::bf16) {
3421 APFloat FloatVal(VT.getSimpleVT() == MVT::f16
 3422 ? APFloat::IEEEhalf()
 3423 : APFloat::BFloat(),
 3424 RawValue.value());
3425 if (TII->isInlineConstant(FloatVal)) {
3426 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3427 MVT::i16);
3428 return true;
3429 }
3430 } else if (VT.getSimpleVT() == MVT::i16) {
3431 if (TII->isInlineConstant(RawValue.value())) {
3432 Src = CurDAG->getTargetConstant(RawValue.value(), SDLoc(In),
3433 MVT::i16);
3434 return true;
3435 }
3436 } else
3437 llvm_unreachable("unknown 16-bit type");
3438 }
3439 }
3440 }
3441
3442 return false;
3443}
3444
3445bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src,
3446 SDValue &IndexKey) const {
3447 unsigned Key = 0;
3448 Src = In;
3449
3450 if (In.getOpcode() == ISD::SRL) {
3451 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3452 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3453 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3454 ShiftAmt->getZExtValue() % 8 == 0) {
3455 Key = ShiftAmt->getZExtValue() / 8;
3456 Src = ShiftSrc;
3457 }
3458 }
3459
3460 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3461 return true;
3462}
3463
3464bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
3465 SDValue &IndexKey) const {
3466 unsigned Key = 0;
3467 Src = In;
3468
3469 if (In.getOpcode() == ISD::SRL) {
3470 const llvm::SDValue &ShiftSrc = In.getOperand(0);
3471 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
3472 if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt &&
3473 ShiftAmt->getZExtValue() == 16) {
3474 Key = 1;
3475 Src = ShiftSrc;
3476 }
3477 }
3478
3479 IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32);
3480 return true;
3481}
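
// Illustrative example (hypothetical nodes): only In = (srl i32:%idx, 16)
// selects the high half here, giving Key == 1; unlike the 8-bit variant, any
// shift amount other than 16 leaves Key == 0.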
3482
3483bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
3484 SDValue &SrcMods) const {
3485 Src = In;
3486 // FIXME: Handle op_sel
3487 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
3488 return true;
3489}
3490
3491bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
3492 SDValue &SrcMods) const {
3493 // FIXME: Handle op_sel
3494 return SelectVOP3Mods(In, Src, SrcMods);
3495}
3496
3497// The return value is not whether the match is possible (which it always is),
3498// but whether or not a conversion is really used.
3499bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
3500 unsigned &Mods) const {
3501 Mods = 0;
3502 SelectVOP3ModsImpl(In, Src, Mods);
3503
3504 if (Src.getOpcode() == ISD::FP_EXTEND) {
3505 Src = Src.getOperand(0);
3506 assert(Src.getValueType() == MVT::f16);
3507 Src = stripBitcast(Src);
3508
3509 // Be careful about folding modifiers if we already have an abs. fneg is
3510 // applied last, so we don't want to apply an earlier fneg.
3511 if ((Mods & SISrcMods::ABS) == 0) {
3512 unsigned ModsTmp;
3513 SelectVOP3ModsImpl(Src, Src, ModsTmp);
3514
3515 if ((ModsTmp & SISrcMods::NEG) != 0)
3516 Mods ^= SISrcMods::NEG;
3517
3518 if ((ModsTmp & SISrcMods::ABS) != 0)
3519 Mods |= SISrcMods::ABS;
3520 }
3521
3522 // op_sel/op_sel_hi decide the source type and the source.
3523 // If the source's op_sel_hi is set, it indicates a conversion from fp16.
3524 // If the source's op_sel is set, it picks the high half of the source
3525 // register.
3526
3527 Mods |= SISrcMods::OP_SEL_1;
3528 if (isExtractHiElt(Src, Src)) {
3529 Mods |= SISrcMods::OP_SEL_0;
3530
3531 // TODO: Should we try to look for neg/abs here?
3532 }
3533
3534 return true;
3535 }
3536
3537 return false;
3538}
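
// Illustrative example (hypothetical nodes): for
//   In = (fneg (fp_extend (extract_vector_elt v2f16:%v, 1)))
// the fneg folds into SISrcMods::NEG, the fp_extend is stripped, OP_SEL_1
// marks the operand as f16, and isExtractHiElt adds OP_SEL_0 to pick the
// high half, so Src = %v and Mods = NEG | OP_SEL_0 | OP_SEL_1.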
3539
3540bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
3541 SDValue &SrcMods) const {
3542 unsigned Mods = 0;
3543 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
3544 return false;
3545 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3546 return true;
3547}
3548
3549bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
3550 SDValue &SrcMods) const {
3551 unsigned Mods = 0;
3552 SelectVOP3PMadMixModsImpl(In, Src, Mods);
3553 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
3554 return true;
3555}
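
// Usage note: the Ext variant above fails to match when no f16-to-f32
// conversion is found, while this variant always matches, keeping whatever
// modifiers SelectVOP3PMadMixModsImpl accumulated.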
3556
3557// Match a BITOP3 operation and return the number of matched instructions
3558// plus the truth table.
3559static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
3560 SmallVectorImpl<SDValue> &Src) {
3561 unsigned NumOpcodes = 0;
3562 uint8_t LHSBits, RHSBits;
3563
3564 auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
3565 // Define truth table given Src0, Src1, Src2 bits permutations:
3566 // 0 0 0
3567 // 0 0 1
3568 // 0 1 0
3569 // 0 1 1
3570 // 1 0 0
3571 // 1 0 1
3572 // 1 1 0
3573 // 1 1 1
3574 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3575
3576 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
3577 if (C->isAllOnes()) {
3578 Bits = 0xff;
3579 return true;
3580 }
3581 if (C->isZero()) {
3582 Bits = 0;
3583 return true;
3584 }
3585 }
3586
3587 for (unsigned I = 0; I < Src.size(); ++I) {
3588 // Try to find existing reused operand
3589 if (Src[I] == Op) {
3590 Bits = SrcBits[I];
3591 return true;
3592 }
3593 // Try to replace parent operator
3594 if (Src[I] == In) {
3595 Bits = SrcBits[I];
3596 Src[I] = Op;
3597 return true;
3598 }
3599 }
3600
3601 if (Src.size() == 3) {
3602 // No room left for operands. Try one last time; there can be a 'not' of
3603 // one of our source operands. In this case we can compute the bits
3604 // without growing the Src vector.
3605 if (Op.getOpcode() == ISD::XOR) {
3606 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3607 if (C->isAllOnes()) {
3608 SDValue LHS = Op.getOperand(0);
3609 for (unsigned I = 0; I < Src.size(); ++I) {
3610 if (Src[I] == LHS) {
3611 Bits = ~SrcBits[I];
3612 return true;
3613 }
3614 }
3615 }
3616 }
3617 }
3618
3619 return false;
3620 }
3621
3622 Bits = SrcBits[Src.size()];
3623 Src.push_back(Op);
3624 return true;
3625 };
3626
3627 switch (In.getOpcode()) {
3628 case ISD::AND:
3629 case ISD::OR:
3630 case ISD::XOR: {
3631 SDValue LHS = In.getOperand(0);
3632 SDValue RHS = In.getOperand(1);
3633
3634 SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
3635 if (!getOperandBits(LHS, LHSBits) ||
3636 !getOperandBits(RHS, RHSBits)) {
3637 Src = Backup;
3638 return std::make_pair(0, 0);
3639 }
3640
3641 // Recursion is naturally limited by the size of the operand vector.
3642 auto Op = BitOp3_Op(LHS, Src);
3643 if (Op.first) {
3644 NumOpcodes += Op.first;
3645 LHSBits = Op.second;
3646 }
3647
3648 Op = BitOp3_Op(RHS, Src);
3649 if (Op.first) {
3650 NumOpcodes += Op.first;
3651 RHSBits = Op.second;
3652 }
3653 break;
3654 }
3655 default:
3656 return std::make_pair(0, 0);
3657 }
3658
3659 uint8_t TTbl;
3660 switch (In.getOpcode()) {
3661 case ISD::AND:
3662 TTbl = LHSBits & RHSBits;
3663 break;
3664 case ISD::OR:
3665 TTbl = LHSBits | RHSBits;
3666 break;
3667 case ISD::XOR:
3668 TTbl = LHSBits ^ RHSBits;
3669 break;
3670 default:
3671 break;
3672 }
3673
3674 return std::make_pair(NumOpcodes + 1, TTbl);
3675}
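
// A worked example of the matcher above (hypothetical values): for
// In = (or (and %a, %b), %c), the outer call reserves slots for the 'and'
// (0xf0) and %c (0xcc); the recursive call then replaces the parent 'and'
// slot with %a (0xf0) and appends %b (0xaa), giving 0xf0 & 0xaa == 0xa0 for
// the AND and 0xa0 | 0xcc == 0xec overall. The result is two matched opcodes
// with truth table 0xec and Src = {%a, %c, %b}.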
3676
3677bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
3678 SDValue &Src2, SDValue &Tbl) const {
3679 SmallVector<SDValue, 3> Src;
3680 uint8_t TTbl;
3681 unsigned NumOpcodes;
3682
3683 std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
3684
3685 // The Src.empty() case can happen if all operands are all zeros or all
3686 // ones. Normally this should have been optimized out before reaching here.
3687 if (NumOpcodes < 2 || Src.empty())
3688 return false;
3689
3690 // For the uniform case the threshold should be higher to account for moves
3691 // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
3692 // be in SGPRs, and a readfirstlane afterwards.
3693 if (NumOpcodes < 4 && !In->isDivergent())
3694 return false;
3695
3696 if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
3697 // Avoid using BITOP3 for OR3, XOR3 and AND_OR. This is not faster, but it
3698 // makes the asm more readable. This cannot be modeled with AddedComplexity
3699 // because the selector does not know how many operations we matched.
3700 if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
3701 (In.getOperand(0).getOpcode() == In.getOpcode() ||
3702 In.getOperand(1).getOpcode() == In.getOpcode()))
3703 return false;
3704
3705 if (In.getOpcode() == ISD::OR &&
3706 (In.getOperand(0).getOpcode() == ISD::AND ||
3707 In.getOperand(1).getOpcode() == ISD::AND))
3708 return false;
3709 }
3710
3711 // The last operand can be ignored, turning a ternary operation into a
3712 // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
3713 // replace 'c' with 'a' here without changing the answer. In some
3714 // pathological cases it is even possible to end up with a single-operand
3715 // operation, if the optimizer did not catch it.
3716 while (Src.size() < 3)
3717 Src.push_back(Src[0]);
3718
3719 Src0 = Src[0];
3720 Src1 = Src[1];
3721 Src2 = Src[2];
3722
3723 Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
3724 return true;
3725}
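
// Illustrative consequence of the checks above: a divergent i32
// (or (and %a, %b), %c) matches two opcodes but hits the OR-of-AND
// carve-out, so it is left to the ordinary and/or patterns; once a third
// logic opcode appears in the tree, BITOP3 takes over with the computed
// table.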
3726
3727SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
3728 if (In.isUndef())
3729 return CurDAG->getUNDEF(MVT::i32);
3730
3731 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
3732 SDLoc SL(In);
3733 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
3734 }
3735
3736 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
3737 SDLoc SL(In);
3738 return CurDAG->getConstant(
3739 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
3740 }
3741
3742 SDValue Src;
3743 if (isExtractHiElt(In, Src))
3744 return Src;
3745
3746 return SDValue();
3747}
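
// Illustrative example (hypothetical constant): getHi16Elt on
// Constant:i16<0x1234> yields the i32 constant 0x12340000, i.e. the value
// placed into the high half of a dword, while an extract of a vector's high
// element returns the source vector itself via isExtractHiElt.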
3748
3749bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
3750 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
3751
3752 const SIRegisterInfo *SIRI =
3753 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3754 const SIInstrInfo * SII =
3755 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3756
3757 unsigned Limit = 0;
3758 bool AllUsesAcceptSReg = true;
3759 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
3760 Limit < 10 && U != E; ++U, ++Limit) {
3761 const TargetRegisterClass *RC =
3762 getOperandRegClass(U->getUser(), U->getOperandNo());
3763
3764 // If the register class is unknown, it could be a class that needs to be
3765 // an SGPR, e.g. one coming from an inline asm
3766 // constraint.
3767 if (!RC || SIRI->isSGPRClass(RC))
3768 return false;
3769
3770 if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
3771 AllUsesAcceptSReg = false;
3772 SDNode *User = U->getUser();
3773 if (User->isMachineOpcode()) {
3774 unsigned Opc = User->getMachineOpcode();
3775 const MCInstrDesc &Desc = SII->get(Opc);
3776 if (Desc.isCommutable()) {
3777 unsigned OpIdx = Desc.getNumDefs() + U->getOperandNo();
3778 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3779 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3780 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3781 const TargetRegisterClass *CommutedRC =
3782 getOperandRegClass(U->getUser(), CommutedOpNo);
3783 if (CommutedRC == &AMDGPU::VS_32RegClass ||
3784 CommutedRC == &AMDGPU::VS_64RegClass)
3785 AllUsesAcceptSReg = true;
3786 }
3787 }
3788 }
3789 // If "AllUsesAcceptSReg == false" at this point, we haven't succeeded in
3790 // commuting the current user. This means we have at least one use
3791 // that strictly requires a VGPR. Thus, we will not attempt to commute
3792 // other user instructions.
3793 if (!AllUsesAcceptSReg)
3794 break;
3795 }
3796 }
3797 return !AllUsesAcceptSReg && (Limit < 10);
3798}
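
// Illustrative reading of the loop above: if every use of the immediate
// either already accepts a VS_32/VS_64 operand or can be commuted into such
// an operand, the immediate tolerates an SGPR and the function returns
// false; a use that strictly requires a VGPR and cannot be commuted away
// makes it return true (provided fewer than 10 uses were scanned).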
3799
3800bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
3801 const auto *Ld = cast<LoadSDNode>(N);
3802
3803 const MachineMemOperand *MMO = Ld->getMemOperand();
3804 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
3805 return false;
3806
3807 return MMO->getSize().hasValue() &&
3808 Ld->getAlign() >=
3809 Align(std::min(MMO->getSize().getValue().getKnownMinValue(),
3810 uint64_t(4))) &&
3811 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3812 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3813 (Subtarget->getScalarizeGlobalBehavior() &&
3814 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3815 Ld->isSimple() &&
3816 static_cast<const SITargetLowering *>(getTargetLowering())
3817 ->isMemOpHasNoClobberedMemOperand(N)));
3818}
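
// Illustrative example (address spaces as in AMDGPUAS): a sufficiently
// aligned load from CONSTANT_ADDRESS with a uniform memory operand passes
// every check and may be selected as a scalar load, while a divergent
// FLAT_ADDRESS load fails the first test and stays a vector load.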
3819
3820void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3821 const AMDGPUTargetLowering& Lowering =
3822 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3823 bool IsModified = false;
3824 do {
3825 IsModified = false;
3826
3827 // Go over all selected nodes and try to fold them a bit more
3828 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3829 while (Position != CurDAG->allnodes_end()) {
3830 SDNode *Node = &*Position++;
3831 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3832 if (!MachineNode)
3833 continue;
3834
3835 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3836 if (ResNode != Node) {
3837 if (ResNode)
3838 ReplaceUses(Node, ResNode);
3839 IsModified = true;
3840 }
3841 }
3842 CurDAG->RemoveDeadNodes();
3843 } while (IsModified);
3844}
3845
3846AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
3847 CodeGenOptLevel OptLevel)
3848 : SelectionDAGISelLegacy(
3849 ID, std::make_unique<AMDGPUDAGToDAGISel>(TM, OptLevel)) {}
3850