//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"

#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#endif

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//
namespace {
static SDValue stripBitcast(SDValue Val) {
  return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}

// Figure out if this is really an extract of the high 16-bits of a dword.
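// For example, (i16 (trunc (srl (i32 (bitcast v2f16:$x)), 16))) and
// (extract_vector_elt v2i16:$x, 1) both identify $x as the source dword.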
static bool isExtractHiElt(SDValue In, SDValue &Out) {
  In = stripBitcast(In);

  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
      if (!Idx->isOne())
        return false;
      Out = In.getOperand(0);
      return true;
    }
  }

  if (In.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue Srl = In.getOperand(0);
  if (Srl.getOpcode() == ISD::SRL) {
    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
      if (ShiftAmt->getZExtValue() == 16) {
        Out = stripBitcast(Srl.getOperand(0));
        return true;
      }
    }
  }

  return false;
}

// Look through operations that obscure just looking at the low 16-bits of the
// same register.
static SDValue stripExtractLoElt(SDValue In) {
  if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue Idx = In.getOperand(1);
    if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
      return In.getOperand(0);
  }

  if (In.getOpcode() == ISD::TRUNCATE) {
    SDValue Src = In.getOperand(0);
    if (Src.getValueType().getSizeInBits() == 32)
      return stripBitcast(Src);
  }

  return In;
}

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)

/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
                                        CodeGenOptLevel OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}

AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
                                       CodeGenOptLevel OptLevel)
    : SelectionDAGISel(ID, TM, OptLevel) {
  EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}

bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
#ifdef EXPENSIVE_CHECKS
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  for (auto &L : LI->getLoopsInPreorder()) {
    assert(L->isLCSSAForm(DT));
  }
#endif
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
  return SelectionDAGISel::runOnMachineFunction(MF);
}

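// Returns true when every instruction selected for Opc is known to write
// zeros to the unused high 16 bits of the 32-bit destination register when
// producing an f16 result, e.g. a 16-bit fadd on gfx9. This is a
// conservative, best-effort list (see the XXX note below).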
bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
  // XXX - only need to list legal operations.
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCANONICALIZE:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
  case ISD::FP_ROUND:
    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
    // high bits on gfx9.
    // TODO: If we had the source node we could see if the source was fma/mad
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::DIV_FIXUP:
    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
}

void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUArgumentUsageInfo>();
  AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
#endif
  SelectionDAGISel::getAnalysisUsage(AU);
}

bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
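  //
  // A d16_hi load writes only the high half of the result register; the low
  // half comes from the tied-in operand, so one load can replace the whole
  // build_vector when the other element is already available.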

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    SDValue TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  if (!Subtarget->d16PreservesUnusedBits())
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    if (N->use_empty())
      continue;

    switch (N->getOpcode()) {
    case ISD::BUILD_VECTOR:
      MadeChange |= matchLoadD16FromBuildVector(N);
      break;
    default:
      break;
    }
  }

  if (MadeChange) {
    CurDAG->RemoveDeadNodes();
    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
               CurDAG->dump(););
  }
}

bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
                                           bool Negated) const {
  if (N->isUndef())
    return true;

  const SIInstrInfo *TII = Subtarget->getInstrInfo();
  if (Negated) {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(-C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());

  } else {
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
      return TII->isInlineConstant(C->getAPIntValue());

    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
  }

  return false;
}

/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                                  unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegBaseClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.operands()[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                               SubRegIdx);
  }
  }
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                         SDValue Glue) const {
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(NewChain); // Replace the chain.
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    Ops.push_back(N->getOperand(i));

  Ops.push_back(Glue);
  return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}

SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
  const SITargetLowering &Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

  assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");

  SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
  return glueCopyToOp(N, M0, M0.getValue(1));
}

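// For LDS/GDS memory operations, glue an M0 initialization onto the node
// when the subtarget requires it: -1 (no limit) for the local address
// space, or the GDS size for the region address space.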
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
  unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
    if (Subtarget->ldsRequiresM0Init())
      return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
  } else if (AS == AMDGPUAS::REGION_ADDRESS) {
    MachineFunction &MF = CurDAG->getMachineFunction();
    unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
    return
        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
  }
  return N;
}

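// Materialize a 64-bit scalar immediate as two S_MOV_B32 halves combined
// with a REG_SEQUENCE, roughly:
//   (REG_SEQUENCE SReg_64,
//      (S_MOV_B32 imm[31:0]),  sub0,
//      (S_MOV_B32 imm[63:32]), sub1)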
MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                  EVT VT) const {
  SDNode *Lo = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
  SDNode *Hi =
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                             CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
  const SDValue Ops[] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};

  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
  EVT VT = N->getValueType(0);
  unsigned NumVectorElts = VT.getVectorNumElements();
  EVT EltVT = VT.getVectorElementType();
  SDLoc DL(N);
  SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);

  if (NumVectorElts == 1) {
    CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
                         RegClass);
    return;
  }

  assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
                                "supported yet");
  // 32 = Max Num Vector Elements
  // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
  // 1 = Vector Register Class
  SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);

  bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
               Triple::amdgcn;
  RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
  bool IsRegSeq = true;
  unsigned NOps = N->getNumOperands();
  for (unsigned i = 0; i < NOps; i++) {
    // XXX: Why is this here?
    if (isa<RegisterSDNode>(N->getOperand(i))) {
      IsRegSeq = false;
      break;
    }
    unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                         : R600RegisterInfo::getSubRegFromChannel(i);
    RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
    RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
  }
  if (NOps != NumVectorElts) {
    // Fill in the missing undef elements if this was a scalar_to_vector.
    assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                   DL, EltVT);
    for (unsigned i = NOps; i < NumVectorElts; ++i) {
      unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
                           : R600RegisterInfo::getSubRegFromChannel(i);
      RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
      RegSeqArgs[1 + (2 * i) + 1] =
          CurDAG->getTargetConstant(Sub, DL, MVT::i32);
    }
  }

  if (!IsRegSeq)
    SelectCode(N);
  CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // isa<MemSDNode> almost works but is slightly too permissive for some DS
  // intrinsics.
  if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
      (Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
    N = glueCopyToM0LDSInit(N);
    SelectCode(N);
    return;
  }

  switch (Opc) {
  default:
    break;
  // We are selecting i64 ADD here instead of custom lowering it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE: {
    if (N->getValueType(0) != MVT::i64)
      break;

    SelectADD_SUB_I64(N);
    return;
  }
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectAddcSubb(N);
    return;
  case ISD::UADDO:
  case ISD::USUBO: {
    SelectUADDO_USUBO(N);
    return;
  }
  case AMDGPUISD::FMUL_W_CHAIN: {
    SelectFMUL_W_CHAIN(N);
    return;
  }
  case AMDGPUISD::FMA_W_CHAIN: {
    SelectFMA_W_CHAIN(N);
    return;
  }

  case ISD::SCALAR_TO_VECTOR:
  case ISD::BUILD_VECTOR: {
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    if (VT.getScalarSizeInBits() == 16) {
      if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
        if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
          ReplaceNode(N, Packed);
          return;
        }
      }

      break;
    }

    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    unsigned RegClassID =
        SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
    SelectBuildVector(N, RegClassID);
    return;
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    SDLoc DL(N);
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                          N->getValueType(0), Ops));
    return;
  }

  case ISD::Constant:
  case ISD::ConstantFP: {
    if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
      break;

    uint64_t Imm;
    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
    else {
      ConstantSDNode *C = cast<ConstantSDNode>(N);
      Imm = C->getZExtValue();
    }

    SDLoc DL(N);
    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
    return;
  }
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    // There is a scalar version available, but unlike the vector version which
    // has a separate operand for the offset and width, the scalar version packs
    // the width and offset into a single operand. Try to move to the scalar
    // version if the offsets are constant, so that we can try to keep extended
    // loads of kernel arguments in SGPRs.

    // TODO: Technically we could try to pattern match scalar bitshifts of
    // dynamic values, but it's probably not useful.
    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!Offset)
      break;

    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    bool Signed = Opc == AMDGPUISD::BFE_I32;

    uint32_t OffsetVal = Offset->getZExtValue();
    uint32_t WidthVal = Width->getZExtValue();

    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
                            WidthVal));
    return;
  }
  case AMDGPUISD::DIV_SCALE: {
    SelectDIV_SCALE(N);
    return;
  }
  case AMDGPUISD::MAD_I64_I32:
  case AMDGPUISD::MAD_U64_U32: {
    SelectMAD_64_32(N);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return SelectMUL_LOHI(N);
  case ISD::CopyToReg: {
    const SITargetLowering &Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());
    N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::SIGN_EXTEND_INREG:
    if (N->getValueType(0) != MVT::i32)
      break;

    SelectS_BFE(N);
    return;
  case ISD::BRCOND:
    SelectBRCOND(N);
    return;
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_PKNORM_I16_F32:
  case AMDGPUISD::CVT_PKNORM_U16_F32:
  case AMDGPUISD::CVT_PK_U16_U32:
  case AMDGPUISD::CVT_PK_I16_I32: {
    // Hack around using a legal type if f16 is illegal.
    if (N->getValueType(0) == MVT::i32) {
      MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
      N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
                              { N->getOperand(0), N->getOperand(1) });
      SelectCode(N);
      return;
    }

    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    SelectINTRINSIC_W_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    SelectINTRINSIC_WO_CHAIN(N);
    return;
  }
  case ISD::INTRINSIC_VOID: {
    SelectINTRINSIC_VOID(N);
    return;
  }
  case AMDGPUISD::WAVE_ADDRESS: {
    SelectWAVE_ADDRESS(N);
    return;
  }
  case ISD::STACKRESTORE: {
    SelectSTACKRESTORE(N);
    return;
  }
  }

  SelectCode(N);
}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
  const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
  const Instruction *Term = BB->getTerminator();
  return Term->getMetadata("amdgpu.uniform") ||
         Term->getMetadata("structurizecfg.uniform");
}

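// Returns true if a mask applied to a shift amount is redundant: the shift
// reads only the low ShAmtBits bits of its amount operand, so e.g. the and
// in (shl x, (and y, 31)) is unneeded when ShAmtBits == 5.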
bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
                                             unsigned ShAmtBits) const {
  assert(N->getOpcode() == ISD::AND);

  const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
  if (RHS.countr_one() >= ShAmtBits)
    return true;

  const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
  return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}

static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's a complicated pattern to match,
    // i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                                    SDValue &RHS) const {
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);
    return true;
  }

  if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
    assert(LHS && RHS && isa<ConstantSDNode>(RHS));
    return true;
  }

  return false;
}

StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}

//===----------------------------------------------------------------------===//
// Complex Patterns
//===----------------------------------------------------------------------===//

bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}

bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  ConstantSDNode *C;
  SDLoc DL(Addr);

  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
    Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
    Base = Addr.getOperand(0);
    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
  } else {
    Base = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  }

  return true;
}

SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                       const SDLoc &DL) const {
  SDNode *Mov = CurDAG->getMachineNode(
      AMDGPU::S_MOV_B32, DL, MVT::i32,
      CurDAG->getTargetConstant(Val, DL, MVT::i32));
  return SDValue(Mov, 0);
}

// FIXME: Should only handle uaddo_carry/usubo_carry
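// The 64-bit add/sub is split into 32-bit halves chained through carry;
// illustratively, for the uniform case:
//   (i64 add x, y) ->
//     (REG_SEQUENCE SReg_64,
//        (S_ADD_U32  x.sub0, y.sub0), sub0,
//        (S_ADDC_U32 x.sub1, y.sub1), sub1)
// with the VALU opcodes from OpcMap used instead for divergent nodes.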
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}

void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue CI = N->getOperand(2);

  if (N->isDivergent()) {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
                                                      : AMDGPU::V_SUBB_U32_e64;
    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {LHS, RHS, CI,
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
                                                      : AMDGPU::S_SUB_CO_PSEUDO;
    CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
  }
}

void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
  // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
  // carry out despite the _i32 name. These were renamed in VI to _U32.
  // FIXME: We should probably rename the opcodes here.
  bool IsAdd = N->getOpcode() == ISD::UADDO;
  bool IsVALU = N->isDivergent();

  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
       ++UI)
    if (UI.getUse().getResNo() == 1) {
      if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
          (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
        IsVALU = true;
        break;
      }
    }

  if (IsVALU) {
    unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    CurDAG->SelectNodeTo(
        N, Opc, N->getVTList(),
        {N->getOperand(0), N->getOperand(1),
         CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
  } else {
    unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
                                                : AMDGPU::S_USUBO_PSEUDO;

    CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
                         {N->getOperand(0), N->getOperand(1)});
  }
}

void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  // If there are no source modifiers, prefer fmac over fma because it can use
  // the smaller VOP2 encoding.
  bool UseFMAC = Subtarget->hasDLInsts() &&
                 cast<ConstantSDNode>(Ops[0])->isZero() &&
                 cast<ConstantSDNode>(Ops[2])->isZero() &&
                 cast<ConstantSDNode>(Ops[4])->isZero();
  unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
}

void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                    Clamp };
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
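// [SU]MUL_LOHI is lowered to a single V_MAD_[IU]64_[IU]32 with a zero
// addend; the low and high i32 results are then split back out of the
// 64-bit mad result with EXTRACT_SUBREG copies of sub0 and sub1.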
void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
  SDLoc SL(N);
  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64
                 : AMDGPU::V_MAD_U64_U32_gfx11_e64;
  else
    Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;

  SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
  SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
  SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
  if (!SDValue(N, 0).use_empty()) {
    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
    SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub0);
    ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
  }
  if (!SDValue(N, 1).use_empty()) {
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
    SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
                                        MVT::i32, SDValue(Mad, 0), Sub1);
    ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
  }
  CurDAG->RemoveDeadNode(N);
}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
  if (!isUInt<16>(Offset))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

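// DS instructions take a base VGPR plus a 16-bit unsigned byte offset, so
// try to fold a constant addend into the offset field, e.g.
// (add v0, 16) -> base = v0, offset = 16.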
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
                                          unsigned Offset1,
                                          unsigned Size) const {
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
    return false;

  if (!Base || Subtarget->hasUsableDSOffset() ||
      Subtarget->unsafeDSOffsetFoldingEnabled())
    return true;

  // On Southern Islands, instructions with a negative base value and an
  // offset don't seem to work.
  return CurDAG->SignBitIsZero(Base);
}

bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
                                                uint64_t FlatVariant) const {
  if (FlatVariant != SIInstrFlags::FlatScratch)
    return true;
  // If the value in the 32-bit Base can be negative, compute the scratch
  // offset with a 32-bit add instruction; otherwise use Base (unsigned) +
  // offset.
  return CurDAG->SignBitIsZero(Base);
}

// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}

bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}

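// read2/write2 variants encode two 8-bit offsets in units of the element
// size, e.g. for Size == 4, a pair at byte offset 8 uses offset0 = 2 and
// offset1 = 3.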
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}

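// Decompose an address into the MUBUF operand set: a base pointer for the
// resource (Ptr), an optional 64-bit VGPR address (VAddr, with Addr64 set),
// a scalar offset register (SOffset), and an immediate offset validated by
// isLegalMUBUFImmOffset. Divergent address parts are routed to VAddr and
// uniform parts to Ptr.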
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}

std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
  SDLoc DL(N);

  auto *FI = dyn_cast<FrameIndexSDNode>(N);
  SDValue TFI =
      FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;

  // We rebase the base address into an absolute stack address and hence
  // use constant 0 for soffset. This value must be retained until
  // frame elimination and eliminateFrameIndex will choose the appropriate
  // frame register if need be.
  return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
      SDValue HighBits =
          CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow. If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  return true;
}

static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
  if (Val.getOpcode() != ISD::CopyFromReg)
    return false;
  auto Reg = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
  if (!Reg.isPhysical())
    return false;
  auto RC = TRI.getPhysRegBaseClass(Reg);
  return RC && TRI.isSGPRClass(RC);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                  SDValue Addr,
                                                  SDValue &SRsrc,
                                                  SDValue &SOffset,
                                                  SDValue &Offset) const {
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDLoc DL(Addr);

  // CopyFromReg <sgpr>
  if (IsCopyFromSGPR(*TRI, Addr)) {
    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
    SOffset = Addr;
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
    return true;
  }

  ConstantSDNode *CAddr;
  if (Addr.getOpcode() == ISD::ADD) {
    // Add (CopyFromReg <sgpr>) <constant>
    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
    if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
      return false;
    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
      return false;

    SOffset = Addr.getOperand(0);
  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
             SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
    // <constant>
    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
  } else {
    return false;
  }

  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                                           SDValue &SOffset,
                                           SDValue &Offset) const {
  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
    return false;

  if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
      !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
      !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
    uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
                    APInt::getAllOnes(32).getZExtValue(); // Size
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
        *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
    return true;
  }
  return false;
}

// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode *findMemSDNode(SDNode *N) {
  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
  if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
    return MN;
  assert(isa<BuildVectorSDNode>(N));
  for (SDValue V : N->op_values())
    if (MemSDNode *MN =
            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
      return MN;
  llvm_unreachable("cannot find MemSDNode in the pattern!");
}

bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &Offset,
                                              uint64_t FlatVariant) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  bool CanHaveFlatSegmentOffsetBug =
      Subtarget->hasFlatSegmentOffsetBug() &&
      FlatVariant == SIInstrFlags::FLAT &&
      (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);

  if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1) &&
        isFlatScratchBaseLegal(N0, FlatVariant)) {
      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.
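        //
        // For example, a large positive offset is split so that both the
        // immediate field and the remainder added to vaddr are non-negative.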

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset) =
            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  return true;
}

bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
}

bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
                                            SDValue &VAddr,
                                            SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
}

bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
                                             SDValue &VAddr,
                                             SDValue &Offset) const {
  return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
                              SIInstrFlags::FlatScratch);
}

// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
  if (Op.getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  SDValue ExtSrc = Op.getOperand(0);
  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
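// i.e. the addressing form used by global_* saddr instructions, roughly:
//   global_load_dword v0, v1, s[0:1] offset:imm
// where the effective address is s[0:1] + zext(v1) + imm.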
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
                               SIInstrFlags::FlatGlobal)) {
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent()) {
      if (COffsetVal > 0) {
        SDLoc SL(N);
        // saddr + large_offset -> saddr +
        //                         (voffset = large_offset & ~MaxOffset) +
        //                         (large_offset & MaxOffset);
        int64_t SplitImmOffset, RemainderOffset;
        std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
            COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);

        if (isUInt<32>(RemainderOffset)) {
          SDNode *VMov = CurDAG->getMachineNode(
              AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
              CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
          VOffset = SDValue(VMov, 0);
          SAddr = LHS;
          Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
          return true;
        }
      }

      // We are adding a 64-bit SGPR and a constant. If constant bus limit
      // is 1 we would need to perform 1 or 2 extra moves for each half of
      // the constant and it is better to do a scalar add and then issue a
      // single VALU instruction to materialize zero. Otherwise it takes
      // fewer instructions to perform VALU adds with immediates or inline
      // literals.
      unsigned NumLiterals =
          !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
          !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
      if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
        return false;
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() == ISD::ADD) {
    LHS = Addr.getOperand(0);
    RHS = Addr.getOperand(1);

    if (!LHS->isDivergent()) {
      // add (i64 sgpr), (zero_extend (i32 vgpr))
      if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
        SAddr = LHS;
        VOffset = ZextRHS;
      }
    }

    if (!SAddr && !RHS->isDivergent()) {
      // add (zero_extend (i32 vgpr)), (i64 sgpr)
      if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
        SAddr = RHS;
        VOffset = ZextLHS;
      }
    }

    if (SAddr) {
      Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
      return true;
    }
  }

  if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
      isa<ConstantSDNode>(Addr))
    return false;

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  SAddr = Addr;
  SDNode *VMov =
      CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
                             CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
  VOffset = SDValue(VMov, 0);
  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
  return true;
}

static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
  } else if (SAddr.getOpcode() == ISD::ADD &&
             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
    // Materialize this into a scalar move for scalar address to avoid
    // readfirstlane.
    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                           MVT::i32, TFI, SAddr.getOperand(1)),
                    0);
  }

  return SAddr;
}

1770// Match (32-bit SGPR base) + sext(imm offset)
1771bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1772 SDValue &SAddr,
1773 SDValue &Offset) const {
1774 if (Addr->isDivergent())
1775 return false;
1776
1777 SDLoc DL(Addr);
1778
1779 int64_t COffsetVal = 0;
1780
1781 if (CurDAG->isBaseWithConstantOffset(Addr) &&
1782 isFlatScratchBaseLegal(Addr.getOperand(0))) {
1783 COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1784 SAddr = Addr.getOperand(0);
1785 } else {
1786 SAddr = Addr;
1787 }
1788
1789 SAddr = SelectSAddrFI(CurDAG, SAddr);
1790
1791 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1792
1793 if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1794 SIInstrFlags::FlatScratch)) {
1795 int64_t SplitImmOffset, RemainderOffset;
1796 std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1797 COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1798
1799 COffsetVal = SplitImmOffset;
1800
1801 SDValue AddOffset =
1802 SAddr.getOpcode() == ISD::TargetFrameIndex
1803 ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1804 : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1805 SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1806 SAddr, AddOffset),
1807 0);
1808 }
1809
1810 Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1811
1812 return true;
1813}
1814
1815// Check whether the flat scratch SVS swizzle bug affects this access.
1816bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
1817 SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
1818 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
1819 return false;
1820
1821 // The bug affects the swizzling of SVS accesses if there is any carry out
1822 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
1823 // voffset to (soffset + inst_offset).
1824 KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
1825 KnownBits SKnown = KnownBits::computeForAddSub(
1826 true, false, CurDAG->computeKnownBits(SAddr),
1827 KnownBits::makeConstant(APInt(32, ImmOffset)));
1828 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
1829 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
1830 return (VMax & 3) + (SMax & 3) >= 4;
1831}
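// Illustrative check: if the known low two bits give VMax & 3 == 3 and
// SMax & 3 == 1, then 3 + 1 >= 4, i.e. adding voffset to
// (soffset + inst_offset) may carry from bit 1 into bit 2, so the SVS form
// is rejected on subtargets with the swizzle bug.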
1832
1833bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1834 SDValue &VAddr, SDValue &SAddr,
1835 SDValue &Offset) const {
1836 int64_t ImmOffset = 0;
1837
1838 SDValue LHS, RHS;
1839 if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1840 int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1841 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1842
1843 if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1844 Addr = LHS;
1845 ImmOffset = COffsetVal;
1846 } else if (!LHS->isDivergent() && COffsetVal > 0) {
1847 SDLoc SL(N);
1848 // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1849 // (large_offset & MaxOffset);
1850 int64_t SplitImmOffset, RemainderOffset;
1851 std::tie(SplitImmOffset, RemainderOffset)
1852 = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1853
1854 if (isUInt<32>(RemainderOffset)) {
1855 SDNode *VMov = CurDAG->getMachineNode(
1856 AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1857 CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1858 VAddr = SDValue(VMov, 0);
1859 SAddr = LHS;
1860 if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
1861 return false;
1862 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
1863 return false;
1864 Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1865 return true;
1866 }
1867 }
1868 }
1869
1870 if (Addr.getOpcode() != ISD::ADD)
1871 return false;
1872
1873 LHS = Addr.getOperand(0);
1874 RHS = Addr.getOperand(1);
1875
1876 if (!LHS->isDivergent() && RHS->isDivergent()) {
1877 SAddr = LHS;
1878 VAddr = RHS;
1879 } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1880 SAddr = RHS;
1881 VAddr = LHS;
1882 } else {
1883 return false;
1884 }
1885
1886 if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
1887 return false;
1888
1889 if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
1890 return false;
1891 SAddr = SelectSAddrFI(CurDAG, SAddr);
1892 Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1893 return true;
1894}
1895
1896// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
1897// not null) offset. If Imm32Only is true, match only 32-bit immediate
1898// offsets available on CI.
1899bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1900 SDValue *SOffset, SDValue *Offset,
1901 bool Imm32Only, bool IsBuffer) const {
1902 assert((!SOffset || !Offset) &&
1903 "Cannot match both soffset and offset at the same time!");
1904
1905 ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1906 if (!C) {
1907 if (!SOffset)
1908 return false;
1909 if (ByteOffsetNode.getValueType().isScalarInteger() &&
1910 ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1911 *SOffset = ByteOffsetNode;
1912 return true;
1913 }
1914 if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1915 if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1916 *SOffset = ByteOffsetNode.getOperand(0);
1917 return true;
1918 }
1919 }
1920 return false;
1921 }
1922
1923 SDLoc SL(ByteOffsetNode);
1924
1925 // GFX9 and GFX10 have signed byte immediate offsets. The immediate
1926 // offset for S_BUFFER instructions is unsigned.
1927 int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
1928 std::optional<int64_t> EncodedOffset =
1929 AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
1930 if (EncodedOffset && Offset && !Imm32Only) {
1931 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1932 return true;
1933 }
1934
1935 // SGPR and literal offsets are unsigned.
1936 if (ByteOffset < 0)
1937 return false;
1938
1939 EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1940 if (EncodedOffset && Offset && Imm32Only) {
1941 *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1942 return true;
1943 }
1944
1945 if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1946 return false;
1947
1948 if (SOffset) {
1949 SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1950 *SOffset = SDValue(
1951 CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1952 return true;
1953 }
1954
1955 return false;
1956}
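// Matching order above, summarized: a non-constant uniform i32 (possibly
// behind a zero_extend) goes straight to *SOffset; a constant first tries
// the normal encoded immediate, then the 32-bit literal form when Imm32Only
// is set, and as a last resort a still-unsigned 32-bit value is
// materialized into an SGPR with S_MOV_B32.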
1957
1958SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1959 if (Addr.getValueType() != MVT::i32)
1960 return Addr;
1961
1962 // Zero-extend a 32-bit address.
1963 SDLoc SL(Addr);
1964
1965 const MachineFunction &MF = CurDAG->getMachineFunction();
1966 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1967 unsigned AddrHiVal = Info->get32BitAddressHighBits();
1968 SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1969
1970 const SDValue Ops[] = {
1971 CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1972 Addr,
1973 CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1974 SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
1975 0),
1976 CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
1977 };
1978
1979 return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
1980 Ops), 0);
1981}
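// The REG_SEQUENCE above forms the 64-bit address as
//   {sub0 = Addr, sub1 = S_MOV_B32 AddrHiVal},
// i.e. Addr64 = (uint64_t(AddrHiVal) << 32) | Addr32, where AddrHiVal holds
// the known high bits of the 32-bit address space.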
1982
1983// Match a base and an immediate (if Offset is not null) or an SGPR (if
1984// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
1985// true, match only 32-bit immediate offsets available on CI.
1986bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
1987 SDValue *SOffset, SDValue *Offset,
1988 bool Imm32Only,
1989 bool IsBuffer) const {
1990 if (SOffset && Offset) {
1991 assert(!Imm32Only && !IsBuffer);
1992 SDValue B;
1993 return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
1994 SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
1995 }
1996
1997 // A 32-bit (address + offset) should not cause unsigned 32-bit integer
1998 // wraparound, because s_load instructions perform the addition in 64 bits.
1999 if (Addr.getValueType() == MVT::i32 && Addr.getOpcode() == ISD::ADD &&
2000 !Addr->getFlags().hasNoUnsignedWrap())
2001 return false;
2002
2003 SDValue N0, N1;
2004 // Extract the base and offset if possible.
2005 if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
2006 N0 = Addr.getOperand(0);
2007 N1 = Addr.getOperand(1);
2008 } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2009 assert(N0 && N1 && isa<ConstantSDNode>(N1));
2010 }
2011 if (!N0 || !N1)
2012 return false;
2013 if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2014 SBase = N0;
2015 return true;
2016 }
2017 if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2018 SBase = N1;
2019 return true;
2020 }
2021 return false;
2022}
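// Illustrative decomposition of the SOffset && Offset case: for
//   Addr = add (add sbase, soffset), imm
// the first recursive call peels imm into *Offset and the second splits the
// inner add into SBase and *SOffset, matching base, register offset, and
// immediate offset out of a single chain of adds.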
2023
2024bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2025 SDValue *SOffset, SDValue *Offset,
2026 bool Imm32Only) const {
2027 if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
2028 SBase = Expand32BitAddress(SBase);
2029 return true;
2030 }
2031
2032 if (Addr.getValueType() == MVT::i32 && Offset && !SOffset) {
2033 SBase = Expand32BitAddress(Addr);
2034 *Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
2035 return true;
2036 }
2037
2038 return false;
2039}
2040
2041bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2042 SDValue &Offset) const {
2043 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
2044}
2045
2046bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2047 SDValue &Offset) const {
2048 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2049 return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
2050 /* Imm32Only */ true);
2051}
2052
2053bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2054 SDValue &SOffset) const {
2055 return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
2056}
2057
2058bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
2059 SDValue &SOffset,
2060 SDValue &Offset) const {
2061 return SelectSMRD(Addr, SBase, &SOffset, &Offset);
2062}
2063
2064bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
2065 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2066 /* Imm32Only */ false, /* IsBuffer */ true);
2067}
2068
2069bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
2070 SDValue &Offset) const {
2071 assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2072 return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
2073 /* Imm32Only */ true, /* IsBuffer */ true);
2074}
2075
2076bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
2077 SDValue &Offset) const {
2078 // Match the (soffset + offset) pair as a 32-bit register base and
2079 // an immediate offset.
2080 return N.getValueType() == MVT::i32 &&
2081 SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
2082 &Offset, /* Imm32Only */ false,
2083 /* IsBuffer */ true);
2084}
2085
2086bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2087 SDValue &Base,
2088 SDValue &Offset) const {
2089 SDLoc DL(Index);
2090
2091 if (CurDAG->isBaseWithConstantOffset(Index)) {
2092 SDValue N0 = Index.getOperand(0);
2093 SDValue N1 = Index.getOperand(1);
2094 ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2095
2096 // (add n0, c0)
2097 // Don't peel off the offset (c0) if doing so could possibly lead
2098 // the base (n0) to be negative.
2099 // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2100 if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2101 (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2102 Base = N0;
2103 Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2104 return true;
2105 }
2106 }
2107
2108 if (isa<ConstantSDNode>(Index))
2109 return false;
2110
2111 Base = Index;
2112 Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2113 return true;
2114}
2115
2116SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2117 SDValue Val, uint32_t Offset,
2118 uint32_t Width) {
2119 if (Val->isDivergent()) {
2120 unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2121 SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2122 SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2123
2124 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2125 }
2126 unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2127 // Pack the offset and width of a BFE into the format expected by the
2128 // S_BFE_I32 / S_BFE_U32 instructions: in the second source operand,
2129 // bits [5:0] contain the offset and bits [22:16] the width.
2130 uint32_t PackedVal = Offset | (Width << 16);
2131 SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2132
2133 return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2134}
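// Worked example of the scalar form: Offset = 8, Width = 5 gives
// PackedVal = 8 | (5 << 16) = 0x50008, so S_BFE_U32 dst, src, 0x50008
// extracts bits [12:8] of src. The divergent path instead passes the same
// two values as separate operands of V_BFE_U32_e64.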
2135
2136void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2137 // ((a << b) srl c) ---> "BFE_U32 a, (c-b), (32-c)"
2138 // ((a << b) sra c) ---> "BFE_I32 a, (c-b), (32-c)"
2139 // Predicate: 0 < b <= c < 32
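// Why this works: (a << b) >> c places bit i of a at bit i + b - c, so the
// surviving field starts at offset c - b and has width 32 - c. E.g. for
// b = 8 and c = 24 this is BFE a, 16, 8, i.e. bits [23:16] of a.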
2140
2141 const SDValue &Shl = N->getOperand(0);
2142 ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2143 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2144
2145 if (B && C) {
2146 uint32_t BVal = B->getZExtValue();
2147 uint32_t CVal = C->getZExtValue();
2148
2149 if (0 < BVal && BVal <= CVal && CVal < 32) {
2150 bool Signed = N->getOpcode() == ISD::SRA;
2151 ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2152 32 - CVal));
2153 return;
2154 }
2155 }
2156 SelectCode(N);
2157}
2158
2159void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2160 switch (N->getOpcode()) {
2161 case ISD::AND:
2162 if (N->getOperand(0).getOpcode() == ISD::SRL) {
2163 // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2164 // Predicate: isMask(mask)
2165 const SDValue &Srl = N->getOperand(0);
2166 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2167 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2168
2169 if (Shift && Mask) {
2170 uint32_t ShiftVal = Shift->getZExtValue();
2171 uint32_t MaskVal = Mask->getZExtValue();
2172
2173 if (isMask_32(MaskVal)) {
2174 uint32_t WidthVal = llvm::popcount(MaskVal);
2175 ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2176 WidthVal));
2177 return;
2178 }
2179 }
2180 }
2181 break;
2182 case ISD::SRL:
2183 if (N->getOperand(0).getOpcode() == ISD::AND) {
2184 // ((a & mask) srl b) ---> "BFE_U32 a, b, popcount(mask >> b)"
2185 // Predicate: isMask(mask >> b)
2186 const SDValue &And = N->getOperand(0);
2187 ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2188 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2189
2190 if (Shift && Mask) {
2191 uint32_t ShiftVal = Shift->getZExtValue();
2192 uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2193
2194 if (isMask_32(MaskVal)) {
2195 uint32_t WidthVal = llvm::popcount(MaskVal);
2196 ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2197 WidthVal));
2198 return;
2199 }
2200 }
2201 } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2202 SelectS_BFEFromShifts(N);
2203 return;
2204 }
2205 break;
2206 case ISD::SRA:
2207 if (N->getOperand(0).getOpcode() == ISD::SHL) {
2208 SelectS_BFEFromShifts(N);
2209 return;
2210 }
2211 break;
2212
2213 case ISD::SIGN_EXTEND_INREG: {
2214 // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2215 SDValue Src = N->getOperand(0);
2216 if (Src.getOpcode() != ISD::SRL)
2217 break;
2218
2219 const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2220 if (!Amt)
2221 break;
2222
2223 unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2224 ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2225 Amt->getZExtValue(), Width));
2226 return;
2227 }
2228 }
2229
2230 SelectCode(N);
2231}
2232
2233bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2234 assert(N->getOpcode() == ISD::BRCOND);
2235 if (!N->hasOneUse())
2236 return false;
2237
2238 SDValue Cond = N->getOperand(1);
2239 if (Cond.getOpcode() == ISD::CopyToReg)
2240 Cond = Cond.getOperand(2);
2241
2242 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2243 return false;
2244
2245 MVT VT = Cond.getOperand(0).getSimpleValueType();
2246 if (VT == MVT::i32)
2247 return true;
2248
2249 if (VT == MVT::i64) {
2250 auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2251
2252 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2253 return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2254 }
2255
2256 return false;
2257}
2258
2259void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2260 SDValue Cond = N->getOperand(1);
2261
2262 if (Cond.isUndef()) {
2263 CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2264 N->getOperand(2), N->getOperand(0));
2265 return;
2266 }
2267
2268 const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2269 const SIRegisterInfo *TRI = ST->getRegisterInfo();
2270
2271 bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2272 unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2273 Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2274 SDLoc SL(N);
2275
2276 if (!UseSCCBr) {
2277 // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
2278 // analyzed what generates the vcc value, so we do not know whether vcc
2279 // bits for disabled lanes are 0. Thus we need to mask out bits for
2280 // disabled lanes.
2281 //
2282 // For the case that we select S_CBRANCH_SCC1 and it gets
2283 // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2284 // SIInstrInfo::moveToVALU which inserts the S_AND.
2285 //
2286 // We could add an analysis of what generates the vcc value here and omit
2287 // the S_AND when it is unnecessary. But it would be better to add a separate
2288 // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2289 // catches both cases.
2290 Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2291 : AMDGPU::S_AND_B64,
2292 SL, MVT::i1,
2293 CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2294 : AMDGPU::EXEC,
2295 MVT::i1),
2296 Cond),
2297 0);
2298 }
2299
2300 SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2301 CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2302 N->getOperand(2), // Basic Block
2303 VCC.getValue(0));
2304}
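// Resulting sequence for a divergent condition on wave64 (illustrative):
//   s_and_b64 s[0:1], exec, cond
//   ; copy s[0:1] to vcc
//   s_cbranch_vccnz BB
// A uniform SETCC instead produces a scalar compare writing SCC followed by
// s_cbranch_scc1.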
2305
2306void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2307 // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2308 // be copied to an SGPR with readfirstlane.
2309 unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2310 AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2311
2312 SDValue Chain = N->getOperand(0);
2313 SDValue Ptr = N->getOperand(2);
2314 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2315 MachineMemOperand *MMO = M->getMemOperand();
2316 bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2317
2318 SDValue Offset;
2319 if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2320 SDValue PtrBase = Ptr.getOperand(0);
2321 SDValue PtrOffset = Ptr.getOperand(1);
2322
2323 const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2324 if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2325 N = glueCopyToM0(N, PtrBase);
2326 Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2327 }
2328 }
2329
2330 if (!Offset) {
2331 N = glueCopyToM0(N, Ptr);
2332 Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2333 }
2334
2335 SDValue Ops[] = {
2336 Offset,
2337 CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2338 Chain,
2339 N->getOperand(N->getNumOperands() - 1) // New glue
2340 };
2341
2342 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2343 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2344}
2345
2346// We need to handle this here because tablegen doesn't support matching
2347// instructions with multiple outputs.
2348void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
2349 unsigned Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2350 SDValue Ops[] = {N->getOperand(2), N->getOperand(3), N->getOperand(4),
2351 N->getOperand(5), N->getOperand(0)};
2352
2353 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2354 MachineMemOperand *MMO = M->getMemOperand();
2355 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2356 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2357}
2358
2359static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2360 switch (IntrID) {
2361 case Intrinsic::amdgcn_ds_gws_init:
2362 return AMDGPU::DS_GWS_INIT;
2363 case Intrinsic::amdgcn_ds_gws_barrier:
2364 return AMDGPU::DS_GWS_BARRIER;
2365 case Intrinsic::amdgcn_ds_gws_sema_v:
2366 return AMDGPU::DS_GWS_SEMA_V;
2367 case Intrinsic::amdgcn_ds_gws_sema_br:
2368 return AMDGPU::DS_GWS_SEMA_BR;
2369 case Intrinsic::amdgcn_ds_gws_sema_p:
2370 return AMDGPU::DS_GWS_SEMA_P;
2371 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2372 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2373 default:
2374 llvm_unreachable("not a gws intrinsic");
2375 }
2376}
2377
2378void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2379 if (!Subtarget->hasGWS() ||
2380 (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2381 !Subtarget->hasGWSSemaReleaseAll())) {
2382 // Let this error.
2383 SelectCode(N);
2384 return;
2385 }
2386
2387 // Chain, intrinsic ID, vsrc, offset
2388 const bool HasVSrc = N->getNumOperands() == 4;
2389 assert(HasVSrc || N->getNumOperands() == 3);
2390
2391 SDLoc SL(N);
2392 SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2393 int ImmOffset = 0;
2394 MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2395 MachineMemOperand *MMO = M->getMemOperand();
2396
2397 // Don't worry if the offset ends up in a VGPR. Only one lane will have an
2398 // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2399
2400 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2401 // offset field) % 64. Some versions of the programming guide omit the m0
2402 // part, or claim it's from offset 0.
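// Illustrative consequence: resource id 5 may be encoded as m0[21:16] = 0
// with offset field 5, or as m0[21:16] = 5 with offset field 0. The constant
// case below therefore zeroes m0 and uses the offset field alone, while the
// variable case shifts the register value left by 16 so it lands in
// m0[21:16].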
2403 if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2404 // If we have a constant offset, try to use the 0 in m0 as the base.
2405 // TODO: Look into changing the default m0 initialization value. If the
2406 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2407 // the immediate offset.
2408 glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2409 ImmOffset = ConstOffset->getZExtValue();
2410 } else {
2411 if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2412 ImmOffset = BaseOffset.getConstantOperandVal(1);
2413 BaseOffset = BaseOffset.getOperand(0);
2414 }
2415
2416 // Prefer to do the shift in an SGPR since it should be possible to use m0
2417 // as the result directly. If it's already an SGPR, it will be eliminated
2418 // later.
2419 SDNode *SGPROffset
2420 = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2421 BaseOffset);
2422 // Shift to offset in m0
2423 SDNode *M0Base
2424 = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2425 SDValue(SGPROffset, 0),
2426 CurDAG->getTargetConstant(16, SL, MVT::i32));
2427 glueCopyToM0(N, SDValue(M0Base, 0));
2428 }
2429
2430 SDValue Chain = N->getOperand(0);
2431 SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2432
2433 const unsigned Opc = gwsIntrinToOpcode(IntrID);
2434 SmallVector<SDValue, 5> Ops;
2435 if (HasVSrc)
2436 Ops.push_back(N->getOperand(2));
2437 Ops.push_back(OffsetField);
2438 Ops.push_back(Chain);
2439
2440 SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2441 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2442}
2443
2444void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2445 if (Subtarget->getLDSBankCount() != 16) {
2446 // This is a single instruction with a pattern.
2447 SelectCode(N);
2448 return;
2449 }
2450
2451 SDLoc DL(N);
2452
2453 // This requires 2 instructions. It is possible to write a pattern to support
2454 // this, but the generated isel emitter doesn't correctly deal with multiple
2455 // output instructions using the same physical register input. The copy to m0
2456 // is incorrectly placed before the second instruction.
2457 //
2458 // TODO: Match source modifiers.
2459 //
2460 // def : Pat <
2461 // (int_amdgcn_interp_p1_f16
2462 // (VOP3Mods f32:$src0, i32:$src0_modifiers),
2463 // (i32 timm:$attrchan), (i32 timm:$attr),
2464 // (i1 timm:$high), M0),
2465 // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2466 // timm:$attrchan, 0,
2467 // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2468 // let Predicates = [has16BankLDS];
2469 // }
2470
2471 // 16 bank LDS
2472 SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2473 N->getOperand(5), SDValue());
2474
2475 SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2476
2477 SDNode *InterpMov =
2478 CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2479 CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2480 N->getOperand(3), // Attr
2481 N->getOperand(2), // Attrchan
2482 ToM0.getValue(1) // In glue
2483 });
2484
2485 SDNode *InterpP1LV =
2486 CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2487 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2488 N->getOperand(1), // Src0
2489 N->getOperand(3), // Attr
2490 N->getOperand(2), // Attrchan
2491 CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2492 SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2493 N->getOperand(4), // high
2494 CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2495 CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2496 SDValue(InterpMov, 1)
2497 });
2498
2499 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2500}
2501
2502void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2503 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2504 switch (IntrID) {
2505 case Intrinsic::amdgcn_ds_append:
2506 case Intrinsic::amdgcn_ds_consume: {
2507 if (N->getValueType(0) != MVT::i32)
2508 break;
2509 SelectDSAppendConsume(N, IntrID);
2510 return;
2511 }
2512 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2513 SelectDSBvhStackIntrinsic(N);
2514 return;
2515 }
2516
2517 SelectCode(N);
2518}
2519
2520void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2521 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2522 unsigned Opcode;
2523 switch (IntrID) {
2524 case Intrinsic::amdgcn_wqm:
2525 Opcode = AMDGPU::WQM;
2526 break;
2527 case Intrinsic::amdgcn_softwqm:
2528 Opcode = AMDGPU::SOFT_WQM;
2529 break;
2530 case Intrinsic::amdgcn_wwm:
2531 case Intrinsic::amdgcn_strict_wwm:
2532 Opcode = AMDGPU::STRICT_WWM;
2533 break;
2534 case Intrinsic::amdgcn_strict_wqm:
2535 Opcode = AMDGPU::STRICT_WQM;
2536 break;
2537 case Intrinsic::amdgcn_interp_p1_f16:
2538 SelectInterpP1F16(N);
2539 return;
2540 case Intrinsic::amdgcn_inverse_ballot:
2541 switch (N->getOperand(1).getValueSizeInBits()) {
2542 case 32:
2543 Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
2544 break;
2545 case 64:
2546 Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
2547 break;
2548 default:
2549 llvm_unreachable("Unsupported size for inverse ballot mask.");
2550 }
2551 break;
2552 default:
2553 SelectCode(N);
2554 return;
2555 }
2556
2557 SDValue Src = N->getOperand(1);
2558 CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2559}
2560
2561void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2562 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2563 switch (IntrID) {
2564 case Intrinsic::amdgcn_ds_gws_init:
2565 case Intrinsic::amdgcn_ds_gws_barrier:
2566 case Intrinsic::amdgcn_ds_gws_sema_v:
2567 case Intrinsic::amdgcn_ds_gws_sema_br:
2568 case Intrinsic::amdgcn_ds_gws_sema_p:
2569 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2570 SelectDS_GWS(N, IntrID);
2571 return;
2572 default:
2573 break;
2574 }
2575
2576 SelectCode(N);
2577}
2578
2579void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
2580 SDValue Log2WaveSize =
2581 CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
2582 CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
2583 {N->getOperand(0), Log2WaveSize});
2584}
2585
2586void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
2587 SDValue SrcVal = N->getOperand(1);
2588 if (SrcVal.getValueType() != MVT::i32) {
2589 SelectCode(N); // Emit default error
2590 return;
2591 }
2592
2593 SDValue CopyVal;
2594 Register SP = TLI->getStackPointerRegisterToSaveRestore();
2595 SDLoc SL(N);
2596
2597 if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
2598 CopyVal = SrcVal.getOperand(0);
2599 } else {
2600 SDValue Log2WaveSize = CurDAG->getTargetConstant(
2601 Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
2602
2603 if (N->isDivergent()) {
2604 SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
2605 MVT::i32, SrcVal),
2606 0);
2607 }
2608
2609 CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2610 {SrcVal, Log2WaveSize}),
2611 0);
2612 }
2613
2614 SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
2615 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
2616}
2617
2618bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2619 unsigned &Mods,
2620 bool IsCanonicalizing,
2621 bool AllowAbs) const {
2622 Mods = SISrcMods::NONE;
2623 Src = In;
2624
2625 if (Src.getOpcode() == ISD::FNEG) {
2626 Mods |= SISrcMods::NEG;
2627 Src = Src.getOperand(0);
2628 } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
2629 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
2630 // denormal mode, but we're implicitly canonicalizing in a source operand.
2631 auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2632 if (LHS && LHS->isZero()) {
2633 Mods |= SISrcMods::NEG;
2634 Src = Src.getOperand(1);
2635 }
2636 }
2637
2638 if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2639 Mods |= SISrcMods::ABS;
2640 Src = Src.getOperand(0);
2641 }
2642
2643 return true;
2644}
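// Illustrative folds performed by SelectVOP3ModsImpl (Mods accumulates what
// was stripped from the source):
//   fneg x        -> Src = x, Mods = NEG
//   fneg (fabs x) -> Src = x, Mods = NEG | ABS  (only when AllowAbs)
//   fsub 0.0, x   -> Src = x, Mods = NEG        (only when IsCanonicalizing)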
2645
2646bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2647 SDValue &SrcMods) const {
2648 unsigned Mods;
2649 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
2650 /*AllowAbs=*/true)) {
2651 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2652 return true;
2653 }
2654
2655 return false;
2656}
2657
2658bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
2659 SDValue In, SDValue &Src, SDValue &SrcMods) const {
2660 unsigned Mods;
2661 if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
2662 /*AllowAbs=*/true)) {
2663 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2664 return true;
2665 }
2666
2667 return false;
2668}
2669
2670bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2671 SDValue &SrcMods) const {
2672 unsigned Mods;
2673 if (SelectVOP3ModsImpl(In, Src, Mods,
2674 /*IsCanonicalizing=*/true,
2675 /*AllowAbs=*/false)) {
2676 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2677 return true;
2678 }
2679
2680 return false;
2681}
2682
2683bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2684 if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2685 return false;
2686
2687 Src = In;
2688 return true;
2689}
2690
2691bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
2692 SDValue &SrcMods,
2693 bool OpSel) const {
2694 unsigned Mods;
2695 if (SelectVOP3ModsImpl(In, Src, Mods,
2696 /*IsCanonicalizing=*/true,
2697 /*AllowAbs=*/false)) {
2698 if (OpSel)
2699 Mods |= SISrcMods::OP_SEL_0;
2700 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2701 return true;
2702 }
2703
2704 return false;
2705}
2706
2707bool AMDGPUDAGToDAGISel::SelectVINTERPMods(SDValue In, SDValue &Src,
2708 SDValue &SrcMods) const {
2709 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ false);
2710}
2711
2712bool AMDGPUDAGToDAGISel::SelectVINTERPModsHi(SDValue In, SDValue &Src,
2713 SDValue &SrcMods) const {
2714 return SelectVINTERPModsImpl(In, Src, SrcMods, /* OpSel */ true);
2715}
2716
2717bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2718 SDValue &SrcMods, SDValue &Clamp,
2719 SDValue &Omod) const {
2720 SDLoc DL(In);
2721 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2722 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2723
2724 return SelectVOP3Mods(In, Src, SrcMods);
2725}
2726
2727bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2728 SDValue &SrcMods, SDValue &Clamp,
2729 SDValue &Omod) const {
2730 SDLoc DL(In);
2731 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2732 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2733
2734 return SelectVOP3BMods(In, Src, SrcMods);
2735}
2736
2737bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2738 SDValue &Clamp, SDValue &Omod) const {
2739 Src = In;
2740
2741 SDLoc DL(In);
2742 Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2743 Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2744
2745 return true;
2746}
2747
2748bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2749 SDValue &SrcMods, bool IsDOT) const {
2750 unsigned Mods = SISrcMods::NONE;
2751 Src = In;
2752
2753 // TODO: Handle G_FSUB 0 as fneg
2754 if (Src.getOpcode() == ISD::FNEG) {
2755 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2756 Src = Src.getOperand(0);
2757 }
2758
2759 if (Src.getOpcode() == ISD::BUILD_VECTOR && Src.getNumOperands() == 2 &&
2760 (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2761 unsigned VecMods = Mods;
2762
2763 SDValue Lo = stripBitcast(Src.getOperand(0));
2764 SDValue Hi = stripBitcast(Src.getOperand(1));
2765
2766 if (Lo.getOpcode() == ISD::FNEG) {
2767 Lo = stripBitcast(Lo.getOperand(0));
2768 Mods ^= SISrcMods::NEG;
2769 }
2770
2771 if (Hi.getOpcode() == ISD::FNEG) {
2772 Hi = stripBitcast(Hi.getOperand(0));
2773 Mods ^= SISrcMods::NEG_HI;
2774 }
2775
2776 if (isExtractHiElt(Lo, Lo))
2777 Mods |= SISrcMods::OP_SEL_0;
2778
2779 if (isExtractHiElt(Hi, Hi))
2780 Mods |= SISrcMods::OP_SEL_1;
2781
2782 unsigned VecSize = Src.getValueSizeInBits();
2783 Lo = stripExtractLoElt(Lo);
2784 Hi = stripExtractLoElt(Hi);
2785
2786 if (Lo.getValueSizeInBits() > VecSize) {
2787 Lo = CurDAG->getTargetExtractSubreg(
2788 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2789 MVT::getIntegerVT(VecSize), Lo);
2790 }
2791
2792 if (Hi.getValueSizeInBits() > VecSize) {
2793 Hi = CurDAG->getTargetExtractSubreg(
2794 (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2795 MVT::getIntegerVT(VecSize), Hi);
2796 }
2797
2798 assert(Lo.getValueSizeInBits() <= VecSize &&
2799 Hi.getValueSizeInBits() <= VecSize);
2800
2801 if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2802 // Really a scalar input. Just select from the low half of the register to
2803 // avoid packing.
2804
2805 if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2806 Src = Lo;
2807 } else {
2808 assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2809
2810 SDLoc SL(In);
2811 SDValue Undef = SDValue(
2812 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2813 Lo.getValueType()), 0);
2814 auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2815 : AMDGPU::SReg_64RegClassID;
2816 const SDValue Ops[] = {
2817 CurDAG->getTargetConstant(RC, SL, MVT::i32),
2818 Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2819 Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2820
2821 Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2822 Src.getValueType(), Ops), 0);
2823 }
2824 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2825 return true;
2826 }
2827
2828 if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2829 uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2830 .bitcastToAPInt().getZExtValue();
2831 if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2832 Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
2833 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2834 return true;
2835 }
2836 }
2837
2838 Mods = VecMods;
2839 }
2840
2841 // Packed instructions do not have abs modifiers.
2842 Mods |= SISrcMods::OP_SEL_1;
2843
2844 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2845 return true;
2846}
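// Illustrative op_sel outcomes: build_vector (x.lo, x.lo) selects Src = x
// with neither op_sel bit set (broadcast the low half), and
// build_vector (x.hi, x.hi) selects Src = x with OP_SEL_0 | OP_SEL_1
// (broadcast the high half). Vectors with genuinely distinct halves fall
// through to the default packed selection with only OP_SEL_1 set.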
2847
2848bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
2849 SDValue &SrcMods) const {
2850 return SelectVOP3PMods(In, Src, SrcMods, true);
2851}
2852
2853bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
2854 const ConstantSDNode *C = cast<ConstantSDNode>(In);
2855 // A literal i1 value set in the intrinsic represents SrcMods for the next
2856 // operand: 1 promotes packed values to signed, 0 treats them as unsigned.
2857 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
2858
2859 unsigned Mods = SISrcMods::OP_SEL_1;
2860 unsigned SrcSign = C->getZExtValue();
2861 if (SrcSign == 1)
2862 Mods ^= SISrcMods::NEG;
2863
2864 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2865 return true;
2866}
2867
2868bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
2869 SDValue &Src) const {
2870 const ConstantSDNode *C = cast<ConstantSDNode>(In);
2871 assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
2872
2873 unsigned Mods = SISrcMods::OP_SEL_1;
2874 unsigned SrcVal = C->getZExtValue();
2875 if (SrcVal == 1)
2876 Mods |= SISrcMods::OP_SEL_0;
2877
2878 Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2879 return true;
2880}
2881
2882bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2883 SDValue &SrcMods) const {
2884 Src = In;
2885 // FIXME: Handle op_sel
2886 SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2887 return true;
2888}
2889
2890bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2891 SDValue &SrcMods) const {
2892 // FIXME: Handle op_sel
2893 return SelectVOP3Mods(In, Src, SrcMods);
2894}
2895
2896 // The return value is not whether the match is possible (which it always is),
2897 // but whether or not a conversion is really used.
2898bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2899 unsigned &Mods) const {
2900 Mods = 0;
2901 SelectVOP3ModsImpl(In, Src, Mods);
2902
2903 if (Src.getOpcode() == ISD::FP_EXTEND) {
2904 Src = Src.getOperand(0);
2905 assert(Src.getValueType() == MVT::f16);
2906 Src = stripBitcast(Src);
2907
2908 // Be careful about folding modifiers if we already have an abs. fneg is
2909 // applied last, so we don't want to apply an earlier fneg.
2910 if ((Mods & SISrcMods::ABS) == 0) {
2911 unsigned ModsTmp;
2912 SelectVOP3ModsImpl(Src, Src, ModsTmp);
2913
2914 if ((ModsTmp & SISrcMods::NEG) != 0)
2915 Mods ^= SISrcMods::NEG;
2916
2917 if ((ModsTmp & SISrcMods::ABS) != 0)
2918 Mods |= SISrcMods::ABS;
2919 }
2920
2921 // op_sel/op_sel_hi decide the source type and source.
2922 // If the source's op_sel_hi is set, it indicates a conversion from fp16.
2923 // If the source's op_sel is set, it picks the high half of the source
2924 // register.
2925
2926 Mods |= SISrcMods::OP_SEL_1;
2927 if (isExtractHiElt(Src, Src)) {
2928 Mods |= SISrcMods::OP_SEL_0;
2929
2930 // TODO: Should we try to look for neg/abs here?
2931 }
2932
2933 return true;
2934 }
2935
2936 return false;
2937}
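// Illustrative mad-mix match: In = fp_extend (extract_hi v) yields Src = v
// with Mods = OP_SEL_1 | OP_SEL_0 (convert from f16, take the high half),
// while a plain f32 input returns false and is used without conversion.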
2938
2939bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
2940 SDValue &SrcMods) const {
2941 unsigned Mods = 0;
2942 if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
2943 return false;
2944 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2945 return true;
2946}
2947
2948bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2949 SDValue &SrcMods) const {
2950 unsigned Mods = 0;
2951 SelectVOP3PMadMixModsImpl(In, Src, Mods);
2952 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2953 return true;
2954}
2955
2956SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2957 if (In.isUndef())
2958 return CurDAG->getUNDEF(MVT::i32);
2959
2960 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2961 SDLoc SL(In);
2962 return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2963 }
2964
2965 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2966 SDLoc SL(In);
2967 return CurDAG->getConstant(
2968 C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2969 }
2970
2971 SDValue Src;
2972 if (isExtractHiElt(In, Src))
2973 return Src;
2974
2975 return SDValue();
2976}
2977
2978bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2979 assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2980
2981 const SIRegisterInfo *SIRI =
2982 static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2983 const SIInstrInfo * SII =
2984 static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2985
2986 unsigned Limit = 0;
2987 bool AllUsesAcceptSReg = true;
2988 for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2989 Limit < 10 && U != E; ++U, ++Limit) {
2990 const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2991
2992 // If the register class is unknown, it could be an unknown
2993 // register class that needs to be an SGPR, e.g. an inline asm
2994 // constraint
2995 if (!RC || SIRI->isSGPRClass(RC))
2996 return false;
2997
2998 if (RC != &AMDGPU::VS_32RegClass) {
2999 AllUsesAcceptSReg = false;
3000 SDNode * User = *U;
3001 if (User->isMachineOpcode()) {
3002 unsigned Opc = User->getMachineOpcode();
3003 const MCInstrDesc &Desc = SII->get(Opc);
3004 if (Desc.isCommutable()) {
3005 unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
3006 unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
3007 if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
3008 unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
3009 const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
3010 if (CommutedRC == &AMDGPU::VS_32RegClass)
3011 AllUsesAcceptSReg = true;
3012 }
3013 }
3014 }
3015 // If "AllUsesAcceptSReg == false" so far, we haven't succeeded in
3016 // commuting the current user. This means we have at least one use
3017 // that strictly requires a VGPR, so we will not attempt to commute
3018 // any other user instructions.
3019 if (!AllUsesAcceptSReg)
3020 break;
3021 }
3022 }
3023 return !AllUsesAcceptSReg && (Limit < 10);
3024}
3025
3026bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
3027 auto Ld = cast<LoadSDNode>(N);
3028
3029 if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
3030 return false;
3031
3032 return Ld->getAlign() >= Align(4) &&
3033 ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3034 Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
3035 (Subtarget->getScalarizeGlobalBehavior() &&
3036 Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3037 Ld->isSimple() &&
3038 static_cast<const SITargetLowering *>(getTargetLowering())
3039 ->isMemOpHasNoClobberedMemOperand(N)));
3040}
3041
3042void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3043 const AMDGPUTargetLowering& Lowering =
3044 *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3045 bool IsModified = false;
3046 do {
3047 IsModified = false;
3048
3049 // Go over all selected nodes and try to fold them a bit more
3050 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3051 while (Position != CurDAG->allnodes_end()) {
3052 SDNode *Node = &*Position++;
3053 MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3054 if (!MachineNode)
3055 continue;
3056
3057 SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3058 if (ResNode != Node) {
3059 if (ResNode)
3060 ReplaceUses(Node, ResNode);
3061 IsModified = true;
3062 }
3063 }
3064 CurDAG->RemoveDeadNodes();
3065 } while (IsModified);
3066}
3067
3068char AMDGPUDAGToDAGISel::ID = 0;
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, SDValue &N0, SDValue &N1)
static SDValue matchZExtFromI32(SDValue Op)
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr)
static MemSDNode * findMemSDNode(SDNode *N)
static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val)
Defines an instruction selector for the AMDGPU target.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static unsigned gwsIntrinToOpcode(unsigned IntrID)
static MachineInstr * isExtractHiElt(MachineInstr *Inst, MachineRegisterInfo &MRI)
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
unsigned const TargetRegisterInfo * TRI
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
pre isel intrinsic Pre ISel Intrinsic Lowering
Provides R600 specific target descriptions.
Interface definition for R600RegisterInfo.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
AMDGPU specific code to select AMDGPU machine instructions for SelectionDAG operations.
void SelectBuildVector(SDNode *N, unsigned RegClassID)
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
void Select(SDNode *N) override
Main hook for targets to transform nodes into machine nodes.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void PreprocessISelDAG() override
PreprocessISelDAG - This hook allows targets to hack on the graph before instruction selection starts...
void PostprocessISelDAG() override
PostprocessISelDAG() - This hook allows the target to hack on the graph right after selection.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
AMDGPUDAGToDAGISel()=delete
bool matchLoadD16FromBuildVector(SDNode *N) const
static bool isUniformMMO(const MachineMemOperand *MMO)
unsigned getWavefrontSizeLog2() const
bool hasInv2PiInlineImm() const
static SDValue stripBitcast(SDValue Val)
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1485
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1600
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
uint64_t getZExtValue() const
int64_t getSExtValue() const
This class represents an Operation in the Expression.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:314
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
int getLDSBankCount() const
Definition: GCNSubtarget.h:301
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:427
bool unsafeDSOffsetFoldingEnabled() const
Definition: GCNSubtarget.h:431
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:584
bool hasDLInsts() const
Definition: GCNSubtarget.h:707
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:231
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasMADIntraFwdBug() const
Definition: GCNSubtarget.h:954
bool privateMemoryResourceIsRangeChecked() const
Definition: GCNSubtarget.h:512
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:243
bool hasDOTOpSelHazard() const
bool d16PreservesUnusedBits() const
Definition: GCNSubtarget.h:638
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:626
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:846
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition: GCNSubtarget.h:648
bool hasFlatScratchSVSSwizzleBug() const
bool hasGWS() const
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:490
Generation getGeneration() const
Definition: GCNSubtarget.h:282
bool hasGWSSemaReleaseAll() const
Definition: GCNSubtarget.h:662
bool hasAddr64() const
Definition: GCNSubtarget.h:342
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:670
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
SmallVector< LoopT *, 4 > getLoopsInPreorder() const
Return all of the loops in the function in preorder across the loop nests, with siblings in forward p...
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:594
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
const Triple & getTargetTriple() const
Machine Value Type.
static MVT getIntegerVT(unsigned BitWidth)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
A description of a memory reference used in the backend.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isDivergent() const
const SDValue & getOperand(unsigned Num) const
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
static bool isLegalMUBUFImmOffset(unsigned Imm)
Definition: SIInstrInfo.h:1198
static unsigned getMaxMUBUFImmOffset()
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
const TargetRegisterClass * getRegClass(unsigned RCID) const
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isSGPRClass(const TargetRegisterClass *RC)
SelectionDAGISel - This is the common base class used for SelectionDAG-based pattern-matching instruc...
std::unique_ptr< FunctionLoweringInfo > FuncInfo
const TargetLowering * TLI
MachineFunction * MF
const TargetInstrInfo * TII
void ReplaceUses(SDValue F, SDValue T)
ReplaceUses - replace all uses of the old node F with the use of the new node T.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void ReplaceNode(SDNode *F, SDNode *T)
Replace all uses of F with T, then remove F from the DAG.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetLowering * getTargetLowering() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDNode * SelectNodeTo(SDNode *N, unsigned MachineOpc, EVT VT)
These are used for target selectors to mutate the specified node to have the specified return type,...
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
allnodes_const_iterator allnodes_begin() const
Definition: SelectionDAG.h:531
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
allnodes_const_iterator allnodes_end() const
Definition: SelectionDAG.h:532
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getTargetFrameIndex(int FI, EVT VT)
Definition: SelectionDAG.h:725
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNodes()
This method deletes all unreachable nodes in the SelectionDAG.
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:771
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:674
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
static const unsigned CommuteAnyOperandIndex
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:355
Legacy analysis pass which computes a CycleInfo.
LLVM Value Representation.
Definition: Value.h:74
Iterator for intrusive lists based on ilist_node.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:385
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:379
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:382
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:381
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:377
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:378
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:383
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer)
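A hedged sketch of how the selector consults these helpers before folding a byte offset into an SMRD/SMEM instruction (Subtarget and ByteOffset assumed in scope):
if (std::optional<int64_t> Enc = AMDGPU::getSMRDEncodedOffset(
        *Subtarget, ByteOffset, /*IsBuffer=*/false)) {
  SDValue OffsetOp = CurDAG->getTargetConstant(*Enc, DL, MVT::i32);
  // ... emit the load with OffsetOp as its immediate offset operand ...
}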
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:119
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to. It returns an output chain.
Definition: ISDOpcodes.h:1121
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
Definition: ISDOpcodes.h:250
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:269
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:487
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store instruction, then an offset node that is added / subtracted from the base pointer to form the address (for indexed memory ops).
Definition: ISDOpcodes.h:1026
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a result.
Definition: ISDOpcodes.h:199
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:787
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
Definition: ISDOpcodes.h:900
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:229
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:934
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the specified vector type; the top elements 1 to N-1 are undefined.
Definition: ISDOpcodes.h:620
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:925
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:500
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:211
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
Definition: ISDOpcodes.h:208
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
Definition: ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:777
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:966
@ TargetFrameIndex
Definition: ISDOpcodes.h:166
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register.
Definition: ISDOpcodes.h:795
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:885
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:303
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:279
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:866
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:783
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1065
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:515
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out, when considering SETFALSE (something that never exists dynamically) as 0 and SETTRUE as -1.
Definition: ISDOpcodes.h:1503
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
constexpr const char32_t SBase
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:440
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:349
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit with the remainder zero (32 bit version).
Definition: MathExtras.h:240
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:141
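Sketch of the common pairing of these two helpers when a 64-bit immediate is materialized as two 32-bit halves (Imm assumed in scope):
SDValue LoHalf = CurDAG->getTargetConstant(Lo_32(Imm), DL, MVT::i32);
SDValue HiHalf = CurDAG->getTargetConstant(Hi_32(Imm), DL, MVT::i32);
// The halves then feed a REG_SEQUENCE that forms the 64-bit register.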
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG, ready for instruction scheduling.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
unsigned M0(unsigned Val)
Definition: VE.h:375
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Description of the encoding of one expression Op.
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:351
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:363
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:239
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:311
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:149
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:319
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:292
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:136
static KnownBits computeForAddSub(bool Add, bool NSW, const KnownBits &LHS, KnownBits RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:57
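Sketch of combining these KnownBits helpers to bound a folded base+offset sum (Base and Off are illustrative names; the 12-bit field is an example width):
KnownBits LHS = CurDAG->computeKnownBits(Base);
KnownBits RHS = KnownBits::makeConstant(APInt(32, Off));
KnownBits Sum =
    KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false, LHS, RHS);
// The fold is safe only if the sum provably fits the unsigned field.
bool Fits = Sum.getMaxValue().ule(4095);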
static unsigned getSubRegFromChannel(unsigned Channel)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.